Merge branch 'devel' of github.com:SheffieldML/GPy into devel

Ricardo 2014-07-11 10:43:02 +01:00
commit 369cc0ba2b
25 changed files with 737 additions and 277 deletions


@@ -9,6 +9,8 @@ import numpy as np
from ...util.misc import param_to_array
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)
import logging, itertools
logger = logging.getLogger('vardtc')
class VarDTC(LatentFunctionInference):
"""
@@ -36,11 +38,11 @@ class VarDTC(LatentFunctionInference):
return param_to_array(np.sum(np.square(Y)))
def __getstate__(self):
# has to be overridden, as Cacher objects cannot be pickled.
return self.limit
def __setstate__(self, state):
# has to be overridden, as Cacher objects cannot be pickled.
self.limit = state
from ...util.caching import Cacher
self.get_trYYT = Cacher(self._get_trYYT, self.limit)
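
The comments above explain why __getstate__/__setstate__ are overridden: Cacher objects cannot be pickled, so only the cache limit is saved and the caches are rebuilt on load. A minimal, self-contained sketch of that pattern; the Cacher class below is an illustrative stand-in, not GPy's implementation:

    import pickle

    class Cacher(object):
        # Stand-in for GPy.util.caching.Cacher (illustrative only): it wraps a bound
        # method, which is what makes the real object unpicklable.
        def __init__(self, operation, limit=1):
            self.operation, self.limit = operation, limit

        def __call__(self, *args):
            return self.operation(*args)  # the real Cacher memoizes up to `limit` results

    class CachedInference(object):
        def __init__(self, limit=1):
            self.limit = limit
            self._rebuild_caches()

        def _rebuild_caches(self):
            self.get_trYYT = Cacher(self._get_trYYT, self.limit)

        def _get_trYYT(self, Y):
            return (Y ** 2).sum()

        def __getstate__(self):
            # keep only what is needed to rebuild the Cacher after unpickling
            return self.limit

        def __setstate__(self, state):
            self.limit = state
            self._rebuild_caches()

    restored = pickle.loads(pickle.dumps(CachedInference(limit=2)))  # works despite the Cacher
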
@@ -196,18 +198,19 @@ class VarDTCMissingData(LatentFunctionInference):
def __init__(self, limit=1, inan=None):
from ...util.caching import Cacher
self._Y = Cacher(self._subarray_computations, limit)
self._inan = inan
if inan is not None: self._inan = ~inan
else: self._inan = None
pass
def set_limit(self, limit):
self._Y.limit = limit
def __getstate__(self):
# has to be overridden, as Cacher objects cannot be pickled.
return self._Y.limit, self._inan
def __setstate__(self, state):
# has to be overridden, as Cacher objects cannot be pickled.
from ...util.caching import Cacher
self.limit = state[0]
self._inan = state[1]
@@ -217,21 +220,35 @@ class VarDTCMissingData(LatentFunctionInference):
if self._inan is None:
inan = np.isnan(Y)
has_none = inan.any()
self._inan = ~inan
else:
inan = self._inan
has_none = True
if has_none:
from ...util.subarray_and_sorting import common_subarrays
self._subarray_indices = []
for v,ind in common_subarrays(inan, 1).iteritems():
if not np.all(v):
v = ~np.array(v, dtype=bool)
ind = np.array(ind, dtype=int)
if ind.size == Y.shape[1]:
ind = slice(None)
self._subarray_indices.append([v,ind])
Ys = [Y[v, :][:, ind] for v, ind in self._subarray_indices]
traces = [(y**2).sum() for y in Ys]
#print "caching missing data slices, this can take several minutes depending on the number of unique dimensions of the data..."
#csa = common_subarrays(inan, 1)
size = Y.shape[1]
#logger.info('preparing subarrays {:3.3%}'.format((i+1.)/size))
Ys = []
next_ten = [0.]
count = itertools.count()
for v, y in itertools.izip(inan.T, Y.T[:,:,None]):
i = count.next()
if ((i+1.)/size) >= next_ten[0]:
logger.info('preparing subarrays {:>6.1%}'.format((i+1.)/size))
next_ten[0] += .1
Ys.append(y[v,:])
next_ten = [0.]
count = itertools.count()
def trace(y):
i = count.next()
if ((i+1.)/size) >= next_ten[0]:
logger.info('preparing traces {:>6.1%}'.format((i+1.)/size))
next_ten[0] += .1
y = y[inan[:,i],i:i+1]
return np.einsum('ij,ij->', y,y)
traces = [trace(Y) for _ in xrange(size)]
return Ys, traces
else:
self._subarray_indices = [[slice(None),slice(None)]]
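
The commented-out message above ("caching missing data slices, ...") describes what this block produces: for every output dimension, the rows where that dimension is observed plus its sum of squares. A small NumPy sketch of the same idea, assuming missing entries are encoded as NaN (variable names here are illustrative):

    import numpy as np

    Y = np.random.randn(6, 3)
    Y[1, 0] = Y[4, 2] = np.nan            # some missing entries

    observed = ~np.isnan(Y)               # per-entry "is observed" mask (stored above as self._inan)
    Ys, traces = [], []
    for d in range(Y.shape[1]):
        y_d = Y[observed[:, d], d:d + 1]  # observed rows of output dimension d, kept as a column
        Ys.append(y_d)
        traces.append(np.einsum('ij,ij->', y_d, y_d))  # sum of squares, i.e. tr(y_d y_d^T)
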
@@ -253,7 +270,6 @@ class VarDTCMissingData(LatentFunctionInference):
beta_all = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
het_noise = beta_all.size != 1
import itertools
num_inducing = Z.shape[0]
dL_dpsi0_all = np.zeros(Y.shape[0])
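
The first two lines of this hunk convert the likelihood variance into a precision with a numerical floor and detect heteroscedastic noise from the size of the result. A tiny illustration with made-up variances:

    import numpy as np

    variance = np.array([1e-9, 0.5, 2.0])    # illustrative per-datum likelihood variances
    beta = 1.0 / np.fmax(variance, 1e-6)     # precision, floored to avoid dividing by ~0
    het_noise = beta.size != 1               # True here: one precision per datum
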
@@ -273,22 +289,24 @@ class VarDTCMissingData(LatentFunctionInference):
Lm = jitchol(Kmm)
if uncertain_inputs: LmInv = dtrtri(Lm)
VVT_factor_all = np.empty(Y.shape)
full_VVT_factor = VVT_factor_all.shape[1] == Y.shape[1]
if not full_VVT_factor:
psi1V = np.dot(Y.T*beta_all, psi1_all).T
#VVT_factor_all = np.empty(Y.shape)
#full_VVT_factor = VVT_factor_all.shape[1] == Y.shape[1]
#if not full_VVT_factor:
# psi1V = np.dot(Y.T*beta_all, psi1_all).T
for y, trYYT, [v, ind] in itertools.izip(Ys, traces, self._subarray_indices):
if het_noise: beta = beta_all[ind]
#logger.info('computing dimension-wise likelihood and derivatives')
#size = len(Ys)
size = Y.shape[1]
next_ten = 0
for i, [y, v, trYYT] in enumerate(itertools.izip(Ys, self._inan.T, traces)):
if ((i+1.)/size) >= next_ten:
logger.info('inference {:> 6.1%}'.format((i+1.)/size))
next_ten += .1
if het_noise: beta = beta_all[i]
else: beta = beta_all
VVT_factor = (beta*y)
try:
VVT_factor_all[v, ind].flat = VVT_factor.flat
except ValueError:
mult = np.ravel_multi_index((v.nonzero()[0][:,None],ind[None,:]), VVT_factor_all.shape)
VVT_factor_all.flat[mult] = VVT_factor
output_dim = y.shape[1]
VVT_factor = (y*beta)
output_dim = 1#len(ind)
psi0 = psi0_all[v]
psi1 = psi1_all[v, :]
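
The next_ten counters above are a simple throttle so that progress is logged only after each additional ten percent of the columns has been processed. A self-contained sketch of that logging pattern (loop body and sizes are made up):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('vardtc')

    size = 137               # e.g. number of output dimensions
    next_ten = 0.0
    for i in range(size):
        # log at most once per additional 10% of columns processed
        if (i + 1.0) / size >= next_ten:
            logger.info('inference {:>6.1%}'.format((i + 1.0) / size))
            next_ten += 0.1
        # ... per-dimension work goes here ...
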
@@ -347,19 +365,20 @@ class VarDTCMissingData(LatentFunctionInference):
psi0, psi1, beta,
data_fit, num_data, output_dim, trYYT, Y)
if full_VVT_factor: woodbury_vector[:, ind] = Cpsi1Vf
else:
print 'foobar'
tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
tmp, _ = dpotrs(LB, tmp, lower=1)
woodbury_vector[:, ind] = dtrtrs(Lm, tmp, lower=1, trans=1)[0]
#if full_VVT_factor:
woodbury_vector[:, i:i+1] = Cpsi1Vf
#else:
# print 'foobar'
# tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
# tmp, _ = dpotrs(LB, tmp, lower=1)
# woodbury_vector[:, ind] = dtrtrs(Lm, tmp, lower=1, trans=1)[0]
#import ipdb;ipdb.set_trace()
Bi, _ = dpotri(LB, lower=1)
symmetrify(Bi)
Bi = -dpotri(LB, lower=1)[0]
diag.add(Bi, 1)
woodbury_inv_all[:, :, ind] = backsub_both_sides(Lm, Bi)[:,:,None]
woodbury_inv_all[:, :, i:i+1] = backsub_both_sides(Lm, Bi)[:,:,None]
dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
@@ -376,23 +395,6 @@ class VarDTCMissingData(LatentFunctionInference):
'dL_dKnm':dL_dpsi1_all,
'dL_dthetaL':dL_dthetaL}
#get sufficient things for posterior prediction
#TODO: do we really want to do this in the loop?
#if not full_VVT_factor:
# print 'foobar'
# psi1V = np.dot(Y.T*beta_all, psi1_all).T
# tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
# tmp, _ = dpotrs(LB_all, tmp, lower=1)
# woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
#import ipdb;ipdb.set_trace()
#Bi, _ = dpotri(LB_all, lower=1)
#symmetrify(Bi)
#Bi = -dpotri(LB_all, lower=1)[0]
#from ...util import diag
#diag.add(Bi, 1)
#woodbury_inv = backsub_both_sides(Lm, Bi)
post = Posterior(woodbury_inv=woodbury_inv_all, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
return post, log_marginal, grad_dict
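
The Posterior object returned here is parameterised by woodbury_vector and woodbury_inv (built per output dimension in the missing-data case). As a hedged reference, this is how such quantities are conventionally turned into predictions in sparse GP models; the function below follows the standard form and is not copied from GPy's Posterior class:

    import numpy as np

    def predict_from_woodbury(K_star_u, K_star_star_diag, woodbury_vector, woodbury_inv):
        # mean = K_*u wv ;  var = diag(K_**) - diag(K_*u Wi K_u*)
        mean = K_star_u.dot(woodbury_vector)
        var = K_star_star_diag - np.einsum('nm,mo,no->n', K_star_u, woodbury_inv, K_star_u)
        return mean, var
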


@@ -22,21 +22,21 @@ class VarDTC_minibatch(LatentFunctionInference):
"""
const_jitter = 1e-6
def __init__(self, batchsize, limit=1):
self.batchsize = batchsize
# Cache functions
from ...util.caching import Cacher
self.get_trYYT = Cacher(self._get_trYYT, limit)
self.get_YYTfactor = Cacher(self._get_YYTfactor, limit)
self.midRes = {}
self.batch_pos = 0 # the starting position of the current mini-batch
def set_limit(self, limit):
self.get_trYYT.limit = limit
self.get_YYTfactor.limit = limit
def _get_trYYT(self, Y):
return param_to_array(np.sum(np.square(Y)))
@@ -51,23 +51,23 @@ class VarDTC_minibatch(LatentFunctionInference):
return param_to_array(Y)
else:
return jitchol(tdot(Y))
def inference_likelihood(self, kern, X, Z, likelihood, Y):
"""
The first phase of inference:
Compute: log-likelihood, dL_dKmm
Cached intermediate results: Kmm, KmmInv,
"""
num_inducing = Z.shape[0]
num_data, output_dim = Y.shape
if isinstance(X, VariationalPosterior):
uncertain_inputs = True
else:
uncertain_inputs = False
#see whether we've got a different noise variance for each datum
beta = 1./np.fmax(likelihood.variance, 1e-6)
het_noise = beta.size > 1
@@ -77,19 +77,19 @@ class VarDTC_minibatch(LatentFunctionInference):
#self.YYTfactor = beta*self.get_YYTfactor(Y)
YYT_factor = Y
trYYT = self.get_trYYT(Y)
psi2_full = np.zeros((num_inducing,num_inducing))
psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM
psi0_full = 0
YRY_full = 0
for n_start in xrange(0,num_data,self.batchsize):
n_end = min(self.batchsize+n_start, num_data)
Y_slice = YYT_factor[n_start:n_end]
X_slice = X[n_start:n_end]
if uncertain_inputs:
psi0 = kern.psi0(Z, X_slice)
psi1 = kern.psi1(Z, X_slice)
@@ -98,7 +98,7 @@ class VarDTC_minibatch(LatentFunctionInference):
psi0 = kern.Kdiag(X_slice)
psi1 = kern.K(X_slice, Z)
psi2 = None
if het_noise:
beta_slice = beta[n_start:n_end]
psi0_full += (beta_slice*psi0).sum()
@@ -106,33 +106,33 @@ class VarDTC_minibatch(LatentFunctionInference):
YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum()
else:
psi0_full += psi0.sum()
psi1Y_full += np.dot(Y_slice.T,psi1) # DxM
if uncertain_inputs:
if het_noise:
psi2_full += beta_slice*psi2
else:
psi2_full += psi2
psi2_full += psi2.sum(0)
else:
if het_noise:
psi2_full += beta_slice*np.outer(psi1,psi1)
else:
psi2_full += np.outer(psi1,psi1)
psi2_full += np.einsum('nm,jk->mk',psi1,psi1)
if not het_noise:
psi0_full *= beta
psi1Y_full *= beta
psi2_full *= beta
YRY_full = trYYT*beta
#======================================================================
# Compute Common Components
#======================================================================
self.psi1Y = psi1Y_full
Kmm = kern.K(Z).copy()
diag.add(Kmm, self.const_jitter)
Lm = jitchol(Kmm)
Lambda = Kmm+psi2_full
LL = jitchol(Lambda)
b,_ = dtrtrs(LL, psi1Y_full.T)
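
The loop above accumulates the psi statistics batch by batch, so only one mini-batch of data has to be held in memory at a time. A plain-NumPy sketch of the accumulation for the homoscedastic, uncertain-inputs branch; the psi arrays are random stand-ins for kern.psi0/psi1/psi2:

    import numpy as np

    num_data, output_dim, num_inducing, batchsize = 100, 2, 10, 32
    Y = np.random.randn(num_data, output_dim)

    psi0_full = 0.0
    psi1Y_full = np.zeros((output_dim, num_inducing))   # DxM, as above
    psi2_full = np.zeros((num_inducing, num_inducing))
    for n_start in range(0, num_data, batchsize):
        n_end = min(n_start + batchsize, num_data)
        nb = n_end - n_start
        # stand-ins for kern.psi0/psi1/psi2 evaluated on the slice X[n_start:n_end]
        psi0 = np.random.rand(nb)
        psi1 = np.random.randn(nb, num_inducing)
        psi2 = np.random.randn(nb, num_inducing, num_inducing)

        psi0_full += psi0.sum()
        psi1Y_full += np.dot(Y[n_start:n_end].T, psi1)
        psi2_full += psi2.sum(0)
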
@@ -140,18 +140,18 @@ class VarDTC_minibatch(LatentFunctionInference):
v,_ = dtrtrs(LL.T,b,lower=False)
vvt = np.einsum('md,od->mo',v,v)
LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right')
Psi2LLInvT = dtrtrs(LL,psi2_full)[0].T
LmInvPsi2LLInvT= dtrtrs(Lm,Psi2LLInvT)[0]
KmmInvPsi2LLInvT = dtrtrs(Lm,LmInvPsi2LLInvT,trans=True)[0]
KmmInvPsi2P = dtrtrs(LL,KmmInvPsi2LLInvT.T, trans=True)[0].T
dL_dpsi2R = (output_dim*KmmInvPsi2P - vvt)/2. # dL_dpsi2 with R inside psi2
# Cache intermediate results
self.midRes['dL_dpsi2R'] = dL_dpsi2R
self.midRes['v'] = v
#======================================================================
# Compute log-likelihood
#======================================================================
@@ -159,30 +159,33 @@ class VarDTC_minibatch(LatentFunctionInference):
logL_R = -np.log(beta).sum()
else:
logL_R = -num_data*np.log(beta)
logL = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)/2.-output_dim*(-np.log(np.diag(Lm)).sum()+np.log(np.diag(LL)).sum())
logL = (
-(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)/2.
-output_dim*(-np.log(np.diag(Lm)).sum()+np.log(np.diag(LL)).sum())
)
#======================================================================
# Compute dL_dKmm
#======================================================================
dL_dKmm = -(output_dim*np.einsum('md,od->mo',KmmInvPsi2LLInvT,KmmInvPsi2LLInvT) + vvt)/2.
#======================================================================
# Compute the Posterior distribution of inducing points p(u|Y)
#======================================================================
# phi_u_mean = np.dot(Kmm,v)
# LLInvKmm,_ = dtrtrs(LL,Kmm)
# # phi_u_var = np.einsum('ma,mb->ab',LLInvKmm,LLInvKmm)
# phi_u_var = Kmm - np.dot(LLInvKmm.T,LLInvKmm)
post = Posterior(woodbury_inv=KmmInvPsi2P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=Lm)
return logL, dL_dKmm, post
def inference_minibatch(self, kern, X, Z, likelihood, Y):
"""
The second phase of inference: Computing the derivatives over a minibatch of Y
Compute: dL_dpsi0, dL_dpsi1, dL_dpsi2, dL_dthetaL
return a flag showing whether it reached the end of Y (isEnd)
"""
@@ -193,14 +196,14 @@ class VarDTC_minibatch(LatentFunctionInference):
uncertain_inputs = True
else:
uncertain_inputs = False
#see whether we've got a different noise variance for each datum
beta = 1./np.fmax(likelihood.variance, 1e-6)
het_noise = beta.size > 1
# VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
#self.YYTfactor = beta*self.get_YYTfactor(Y)
YYT_factor = Y
n_start = self.batch_pos
n_end = min(self.batchsize+n_start, num_data)
if n_end==num_data:
@@ -209,11 +212,11 @@ class VarDTC_minibatch(LatentFunctionInference):
else:
isEnd = False
self.batch_pos = n_end
num_slice = n_end-n_start
Y_slice = YYT_factor[n_start:n_end]
X_slice = X[n_start:n_end]
if uncertain_inputs:
psi0 = kern.psi0(Z, X_slice)
psi1 = kern.psi1(Z, X_slice)
@@ -222,51 +225,51 @@ class VarDTC_minibatch(LatentFunctionInference):
psi0 = kern.Kdiag(X_slice)
psi1 = kern.K(X_slice, Z)
psi2 = None
if het_noise:
beta = beta[n_start] # assuming batchsize==1
betaY = beta*Y_slice
betapsi1 = np.einsum('n,nm->nm',beta,psi1)
#======================================================================
# Load Intermediate Results
#======================================================================
dL_dpsi2R = self.midRes['dL_dpsi2R']
v = self.midRes['v']
#======================================================================
# Compute dL_dpsi
#======================================================================
dL_dpsi0 = -0.5 * output_dim * (beta * np.ones((n_end-n_start,)))
dL_dpsi1 = np.dot(betaY,v.T)
if uncertain_inputs:
dL_dpsi2 = beta* dL_dpsi2R
else:
dL_dpsi1 += np.dot(betapsi1,dL_dpsi2R)*2.
dL_dpsi2 = None
#======================================================================
# Compute dL_dthetaL
#======================================================================
if het_noise:
if uncertain_inputs:
psiR = np.einsum('mo,nmo->n',dL_dpsi2R,psi2)
psiR = np.einsum('mo,nmo->',dL_dpsi2R,psi2)
else:
psiR = np.einsum('nm,no,mo->n',psi1,psi1,dL_dpsi2R)
psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R)
dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1)
else:
if uncertain_inputs:
psiR = np.einsum('mo,mo->',dL_dpsi2R,psi2)
psiR = np.einsum('mo,nmo->',dL_dpsi2R,psi2)
else:
psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R)
dL_dthetaL = ((np.square(betaY)).sum() + beta*beta*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - beta*beta*psiR- (betaY*np.dot(betapsi1,v)).sum()
if uncertain_inputs:
@@ -278,15 +281,15 @@ class VarDTC_minibatch(LatentFunctionInference):
grad_dict = {'dL_dKdiag':dL_dpsi0,
'dL_dKnm':dL_dpsi1,
'dL_dthetaL':dL_dthetaL}
return isEnd, (n_start,n_end), grad_dict
def update_gradients(model):
model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, model.X, model.Z, model.likelihood, model.Y)
het_noise = model.likelihood.variance.size > 1
if het_noise:
dL_dthetaL = np.empty((model.Y.shape[0],))
else:
@@ -295,40 +298,54 @@ def update_gradients(model):
#gradients w.r.t. kernel
model.kern.update_gradients_full(dL_dKmm, model.Z, None)
kern_grad = model.kern.gradient.copy()
#gradients w.r.t. Z
model.Z.gradient = model.kern.gradients_X(dL_dKmm, model.Z)
isEnd = False
while not isEnd:
isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, model.X, model.Z, model.likelihood, model.Y)
if isinstance(model.X, VariationalPosterior):
X_slice = model.X[n_range[0]:n_range[1]]
dL_dpsi1 = grad_dict['dL_dpsi1']#[None, :]
dL_dpsi2 = grad_dict['dL_dpsi2'][None, :, :]
#gradients w.r.t. kernel
model.kern.update_gradients_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])
model.kern.update_gradients_expectations(variational_posterior=X_slice,Z=model.Z,dL_dpsi0=grad_dict['dL_dpsi0'],dL_dpsi1=dL_dpsi1,dL_dpsi2=dL_dpsi2)
kern_grad += model.kern.gradient
#gradients w.r.t. Z
model.Z.gradient += model.kern.gradients_Z_expectations(
dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=X_slice)
dL_dpsi0=grad_dict['dL_dpsi0'],
dL_dpsi1=dL_dpsi1,
dL_dpsi2=dL_dpsi2,
Z=model.Z, variational_posterior=X_slice)
#gradients w.r.t. posterior parameters of X
X_grad = model.kern.gradients_qX_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])
model.set_X_gradients(X_slice, X_grad)
X_grad = model.kern.gradients_qX_expectations(
variational_posterior=X_slice,
Z=model.Z,
dL_dpsi0=grad_dict['dL_dpsi0'],
dL_dpsi1=dL_dpsi1,
dL_dpsi2=dL_dpsi2)
model.X.mean[n_range[0]:n_range[1]].gradient = X_grad[0]
model.X.variance[n_range[0]:n_range[1]].gradient = X_grad[1]
if het_noise:
dL_dthetaL[n_range[0]:n_range[1]] = grad_dict['dL_dthetaL']
else:
dL_dthetaL += grad_dict['dL_dthetaL']
#import ipdb;ipdb.set_trace()
model.grad_dict = grad_dict
if isinstance(model.X, VariationalPosterior):
# Update Log-likelihood
model._log_marginal_likelihood -= model.variational_prior.KL_divergence(model.X)
# update for the KL divergence
model.variational_prior.update_gradients_KL(model.X)
# Set the gradients w.r.t. kernel
model.kern.gradient = kern_grad
# Update Log-likelihood
model._log_marginal_likelihood -= model.variational_prior.KL_divergence(model.X)
# update for the KL divergence
model.variational_prior.update_gradients_KL(model.X)
# dL_dthetaL
model.likelihood.update_gradients(dL_dthetaL)


@@ -56,13 +56,13 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
if gtol is None:
gtol = 1e-5
sigma0 = 1.0e-8
sigma0 = 1.0e-7
fold = f(x, *optargs) # Initial function value.
function_eval = 1
fnow = fold
gradnew = gradf(x, *optargs) # Initial gradient.
if any(np.isnan(gradnew)):
raise UnexpectedInfOrNan, "Gradient contribution resulted in a NaN value"
#if any(np.isnan(gradnew)):
# raise UnexpectedInfOrNan, "Gradient contribution resulted in a NaN value"
current_grad = np.dot(gradnew, gradnew)
gradold = gradnew.copy()
d = -gradnew # Initial search direction.
@@ -168,13 +168,13 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
if Delta < 0.25:
beta = min(4.0 * beta, betamax)
if Delta > 0.75:
beta = max(0.5 * beta, betamin)
beta = max(0.25 * beta, betamin)
# Update search direction using Polak-Ribiere formula, or re-start
# in direction of negative gradient after nparams steps.
if nsuccess == x.size:
d = -gradnew
# beta = 1. # TODO: betareset!!
beta = 1. # This is not in the original paper
nsuccess = 0
elif success:
Gamma = np.dot(gradold - gradnew, gradnew) / (mu)
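
The comment above names the Polak-Ribiere style update: on a successful step the new search direction mixes the previous direction with the new negative gradient via Gamma, and after x.size successful steps the direction is reset to plain steepest descent. A self-contained sketch, assuming the usual Netlab-style continuation d = Gamma*d - gradnew (which falls outside this hunk); mu is the curvature term SCG maintains elsewhere:

    import numpy as np

    def next_search_direction(grad_old, grad_new, d_old, mu, nsuccess, nparams):
        # Restart with steepest descent after nparams successful steps, otherwise
        # apply the Polak-Ribiere-style coefficient computed above.
        if nsuccess == nparams:
            return -grad_new
        Gamma = np.dot(grad_old - grad_new, grad_new) / mu
        return Gamma * d_old - grad_new
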