[optimization] deleted and backwardscompatible

2026-05-24 14:15:14 +02:00 · 2015-11-09 13:27:24 +00:00 · 2015-11-09 13:27:24 +00:00 · 913d80f712
commit 913d80f712
parent 850c10beaa
8 changed files with 11 additions and 914 deletions
--- a/GPy/init.py
+++ b/GPy/init.py
@ -20,7 +20,6 @@ backwards_compatibility = ['lists_and_dicts', 'observable_array', 'ties_and_rema
 for bc in backwards_compatibility:
    sys.modules['GPy.core.parameterization.{!s}'.format(bc)] = getattr(core.parameterization, bc)
 # Direct imports for convenience:
 from .core import Model
 from .core.parameterization import priors
@ -52,5 +51,6 @@ def load(file_or_path):
    for name, module in inspect.getmembers(kern.src):  # @UndefinedVariable
        if not name.startswith('_'):
            sys.modules['GPy.kern._src.{}'.format(name)] = module
    sys.modules['GPy.inference.optimization'] = inference.optimization
    import paramz
    return paramz.load(file_or_path)
--- a/GPy/inference/init.py
+++ b/GPy/inference/init.py
@ -1,3 +1,7 @@
 from . import latent_function_inference
 from . import optimization
 from . import latent_function_inference
 from . import mcmc
 import sys
 sys.modules['GPy.inference.optimization'] = optimization
 sys.modules['GPy.inference.optimization.optimization'] = optimization
--- a/GPy/inference/optimization/init.py
+++ b/GPy/inference/optimization/init.py
@ -1,2 +1,5 @@
-from .scg import SCG
+from paramz.optimization import stochastics, Optimizer
-from .optimization import *
+from paramz.optimization import *
 import sys
 sys.modules['GPy.inference.optimization.stochastics'] = stochastics
 sys.modules['GPy.inference.optimization.Optimizer'] = Optimizer
--- a/GPy/inference/optimization/conjugate_gradient_descent.py
+++ b/GPy/inference/optimization/conjugate_gradient_descent.py
@ -1,285 +0,0 @@
 # Copyright (c) 2012-2014, Max Zwiessele
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 from .gradient_descent_update_rules import FletcherReeves, \
    PolakRibiere
 from Queue import Empty
 from multiprocessing import Value
 from multiprocessing.queues import Queue
 from multiprocessing.synchronize import Event
 from scipy.optimize.linesearch import line_search_wolfe1, line_search_wolfe2
 from threading import Thread
 import numpy
 import sys
 import time
 RUNNING = "running"
 CONVERGED = "converged"
 MAXITER = "maximum number of iterations reached"
 MAX_F_EVAL = "maximum number of function calls reached"
 LINE_SEARCH = "line search failed"
 KBINTERRUPT = "interrupted"
 class _Async_Optimization(Thread):
    def __init__(self, f, df, x0, update_rule, runsignal, SENTINEL,
                 report_every=10, messages=0, maxiter=5e3, max_f_eval=15e3,
                 gtol=1e-6, outqueue=None, *args, **kw):
        """
        Helper Process class for async optimization
        f_call and df_call are Multiprocessing Values, for synchronized assignment
        """
        self.f_call = Value('i', 0)
        self.df_call = Value('i', 0)
        self.f = self.f_wrapper(f, self.f_call)
        self.df = self.f_wrapper(df, self.df_call)
        self.x0 = x0
        self.update_rule = update_rule
        self.report_every = report_every
        self.messages = messages
        self.maxiter = maxiter
        self.max_f_eval = max_f_eval
        self.gtol = gtol
        self.SENTINEL = SENTINEL
        self.runsignal = runsignal
 #         self.parent = parent
 #         self.result = None
        self.outq = outqueue
        super(_Async_Optimization, self).__init__(target=self.run,
                                            name="CG Optimization",
                                            *args, **kw)
 #     def __enter__(self):
 #         return self
 #
 #     def __exit__(self, type, value, traceback):
 #         return isinstance(value, TypeError)
    def f_wrapper(self, f, counter):
        def f_w(*a, **kw):
            counter.value += 1
            return f(*a, **kw)
        return f_w
    def callback(self, *a):
        if self.outq is not None:
            self.outq.put(a)
 #         self.parent and self.parent.callback(*a, **kw)
        pass
        # print "callback done"
    def callback_return(self, *a):
        self.callback(*a)
        if self.outq is not None:
            self.outq.put(self.SENTINEL)
        if self.messages:
            print("")
        self.runsignal.clear()
    def run(self, *args, **kwargs):
        raise NotImplementedError("Overwrite this with optimization (for async use)")
        pass
 class _CGDAsync(_Async_Optimization):
    def reset(self, xi, *a, **kw):
        gi = -self.df(xi, *a, **kw)
        si = gi
        ur = self.update_rule(gi)
        return gi, ur, si
    def run(self, *a, **kw):
        status = RUNNING
        fi = self.f(self.x0)
        fi_old = fi + 5000
        gi, ur, si = self.reset(self.x0, *a, **kw)
        xi = self.x0
        xi_old = numpy.nan
        it = 0
        while it < self.maxiter:
            if not self.runsignal.is_set():
                break
            if self.f_call.value > self.max_f_eval:
                status = MAX_F_EVAL
            gi = -self.df(xi, *a, **kw)
            if numpy.dot(gi.T, gi) <= self.gtol:
                status = CONVERGED
                break
            if numpy.isnan(numpy.dot(gi.T, gi)):
                if numpy.any(numpy.isnan(xi_old)):
                    status = CONVERGED
                    break
                self.reset(xi_old)
            gammai = ur(gi)
            if gammai < 1e-6 or it % xi.shape[0] == 0:
                gi, ur, si = self.reset(xi, *a, **kw)
            si = gi + gammai * si
            alphai, _, _, fi2, fi_old2, gfi = line_search_wolfe1(self.f,
                                                                 self.df,
                                                                 xi,
                                                                 si, gi,
                                                                 fi, fi_old)
            if alphai is None:
                alphai, _, _, fi2, fi_old2, gfi = \
                         line_search_wolfe2(self.f, self.df,
                                            xi, si, gi,
                                            fi, fi_old)
                if alphai is None:
                    # This line search also failed to find a better solution.
                    status = LINE_SEARCH
                    break
            if fi2 < fi:
                fi, fi_old = fi2, fi_old2
            if gfi is not None:
                gi = gfi
            if numpy.isnan(fi) or fi_old < fi:
                gi, ur, si = self.reset(xi, *a, **kw)
            else:
                xi += numpy.dot(alphai, si)
                if self.messages:
                    sys.stdout.write("\r")
                    sys.stdout.flush()
                    sys.stdout.write("iteration: {0:> 6g}  f:{1:> 12e}  |g|:{2:> 12e}".format(it, fi, numpy.dot(gi.T, gi)))
            if it % self.report_every == 0:
                self.callback(xi, fi, gi, it, self.f_call.value, self.df_call.value, status)
            it += 1
        else:
            status = MAXITER
        self.callback_return(xi, fi, gi, it, self.f_call.value, self.df_call.value, status)
        self.result = [xi, fi, gi, it, self.f_call.value, self.df_call.value, status]
 class Async_Optimize(object):
    callback = lambda *x: None
    runsignal = Event()
    SENTINEL = "SENTINEL"
    def async_callback_collect(self, q):
        while self.runsignal.is_set():
            try:
                for ret in iter(lambda: q.get(timeout=1), self.SENTINEL):
                    self.callback(*ret)
                self.runsignal.clear()
            except Empty:
                pass
    def opt_async(self, f, df, x0, callback, update_rule=PolakRibiere,
                   messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6,
                   report_every=10, *args, **kwargs):
        self.runsignal.set()
        c = None
        outqueue = None
        if callback:
            outqueue = Queue()
            self.callback = callback
            c = Thread(target=self.async_callback_collect, args=(outqueue,))
            c.start()
        p = _CGDAsync(f, df, x0, update_rule, self.runsignal, self.SENTINEL,
                 report_every=report_every, messages=messages, maxiter=maxiter,
                 max_f_eval=max_f_eval, gtol=gtol, outqueue=outqueue, *args, **kwargs)
        p.start()
        return p, c
    def opt(self, f, df, x0, callback=None, update_rule=FletcherReeves,
                   messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6,
                   report_every=10, *args, **kwargs):
        p, c = self.opt_async(f, df, x0, callback, update_rule, messages,
                            maxiter, max_f_eval, gtol,
                            report_every, *args, **kwargs)
        while self.runsignal.is_set():
            try:
                p.join(1)
                if c: c.join(1)
            except KeyboardInterrupt:
                # print "^C"
                self.runsignal.clear()
                p.join()
                if c: c.join()
        if c and c.is_alive():
 #             self.runsignal.set()
 #             while self.runsignal.is_set():
 #                 try:
 #                     c.join(.1)
 #                 except KeyboardInterrupt:
 #                     # print "^C"
 #                     self.runsignal.clear()
 #                     c.join()
            print("WARNING: callback still running, optimisation done!")
        return p.result
 class CGD(Async_Optimize):
    '''
    Conjugate gradient descent algorithm to minimize
    function f with gradients df, starting at x0
    with update rule update_rule
    if df returns tuple (grad, natgrad) it will optimize according 
    to natural gradient rules
    '''
    opt_name = "Conjugate Gradient Descent"
    def opt_async(self, *a, **kw):
        """
        opt_async(self, f, df, x0, callback, update_rule=FletcherReeves,
               messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6,
               report_every=10, \*args, \*\*kwargs)
        callback gets called every `report_every` iterations
            callback(xi, fi, gi, iteration, function_calls, gradient_calls, status_message)
        if df returns tuple (grad, natgrad) it will optimize according 
        to natural gradient rules
        f, and df will be called with
            f(xi, \*args, \*\*kwargs)
            df(xi, \*args, \*\*kwargs)
        **Returns:**
            Started `Process` object, optimizing asynchronously 
        **Calls:** 
            callback(x_opt, f_opt, g_opt, iteration, function_calls, gradient_calls, status_message)
        at end of optimization!
        """
        return super(CGD, self).opt_async(*a, **kw)
    def opt(self, *a, **kw):
        """
        opt(self, f, df, x0, callback=None, update_rule=FletcherReeves,
               messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6,
               report_every=10, \*args, \*\*kwargs)
        Minimize f, calling callback every `report_every` iterations with following syntax:
            callback(xi, fi, gi, iteration, function_calls, gradient_calls, status_message)
        if df returns tuple (grad, natgrad) it will optimize according 
        to natural gradient rules
        f, and df will be called with
            f(xi, \*args, \*\*kwargs)
            df(xi, \*args, \*\*kwargs)
        **returns** 
            x_opt, f_opt, g_opt, iteration, function_calls, gradient_calls, status_message
        at end of optimization
        """
        return super(CGD, self).opt(*a, **kw)
--- a/GPy/inference/optimization/gradient_descent_update_rules.py
+++ b/GPy/inference/optimization/gradient_descent_update_rules.py
@ -1,51 +0,0 @@
 # Copyright (c) 2012-2014, Max Zwiessele
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy
 class GDUpdateRule():
    _gradnat = None
    _gradnatold = None
    def __init__(self, initgrad, initgradnat=None):
        self.grad = initgrad
        if initgradnat:
            self.gradnat = initgradnat
        else:
            self.gradnat = initgrad
        # self.grad, self.gradnat
    def _gamma(self):
        raise NotImplemented("""Implement gamma update rule here, 
        you can use self.grad and self.gradold for parameters, as well as
        self.gradnat and self.gradnatold for natural gradients.""")
    def __call__(self, grad, gradnat=None, si=None, *args, **kw):
        """
        Return gamma for given gradients and optional natural gradients
        """
        if not gradnat:
            gradnat = grad
        self.gradold = self.grad
        self.gradnatold = self.gradnat
        self.grad = grad
        self.gradnat = gradnat
        self.si = si
        return self._gamma(*args, **kw)
 class FletcherReeves(GDUpdateRule):
    '''
    Fletcher Reeves update rule for gamma
    '''
    def _gamma(self, *a, **kw):
        tmp = numpy.dot(self.grad.T, self.gradnat)
        if tmp:
            return tmp / numpy.dot(self.gradold.T, self.gradnatold)
        return tmp
 class PolakRibiere(GDUpdateRule):
    '''
    Fletcher Reeves update rule for gamma
    '''
    def _gamma(self, *a, **kw):
        tmp = numpy.dot((self.grad - self.gradold).T, self.gradnat)
        if tmp:
            return tmp / numpy.dot(self.gradold.T, self.gradnatold)
        return tmp
--- a/GPy/inference/optimization/optimization.py
+++ b/GPy/inference/optimization/optimization.py
@ -1,289 +0,0 @@
 # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import datetime as dt
 from scipy import optimize
 from warnings import warn
 try:
    import rasmussens_minimize as rasm
    rasm_available = True
 except ImportError:
    rasm_available = False
 from .scg import SCG
 class Optimizer(object):
    """
    Superclass for all the optimizers.
    :param x_init: initial set of parameters
    :param f_fp: function that returns the function AND the gradients at the same time
    :param f: function to optimize
    :param fp: gradients
    :param messages: print messages from the optimizer?
    :type messages: (True | False)
    :param max_f_eval: maximum number of function evaluations
    :rtype: optimizer object.
    """
    def __init__(self, x_init=None, messages=False, max_f_eval=1e4, max_iters=1e3,
                 ftol=None, gtol=None, xtol=None, bfgs_factor=None):
        self.opt_name = None
        #x_init = x_init
        # Turning messages off and using internal structure for print outs:
        self.messages = False #messages
        self.f_opt = None
        self.x_opt = None
        self.funct_eval = None
        self.status = None
        self.max_f_eval = int(max_iters)
        self.max_iters = int(max_iters)
        self.bfgs_factor = bfgs_factor
        self.trace = None
        self.time = "Not available"
        self.xtol = xtol
        self.gtol = gtol
        self.ftol = ftol
    def run(self, **kwargs):
        start = dt.datetime.now()
        self.opt(**kwargs)
        end = dt.datetime.now()
        self.time = str(end - start)
    def opt(self, x_init, f_fp=None, f=None, fp=None):
        raise NotImplementedError("this needs to be implemented to use the optimizer class")
    def __str__(self):
        diagnostics = "Optimizer: \t\t\t\t %s\n" % self.opt_name
        diagnostics += "f(x_opt): \t\t\t\t %.3f\n" % self.f_opt
        diagnostics += "Number of function evaluations: \t %d\n" % self.funct_eval
        diagnostics += "Optimization status: \t\t\t %s\n" % self.status
        diagnostics += "Time elapsed: \t\t\t\t %s\n" % self.time
        return diagnostics
    def __getstate__(self):
        return []
 class opt_tnc(Optimizer):
    def __init__(self, *args, **kwargs):
        Optimizer.__init__(self, *args, **kwargs)
        self.opt_name = "TNC (Scipy implementation)"
    def opt(self, x_init, f_fp=None, f=None, fp=None):
        """
        Run the TNC optimizer
        """
        tnc_rcstrings = ['Local minimum', 'Converged', 'XConverged', 'Maximum number of f evaluations reached',
             'Line search failed', 'Function is constant']
        assert f_fp != None, "TNC requires f_fp"
        opt_dict = {}
        if self.xtol is not None:
            opt_dict['xtol'] = self.xtol
        if self.ftol is not None:
            opt_dict['ftol'] = self.ftol
        if self.gtol is not None:
            opt_dict['pgtol'] = self.gtol
        opt_result = optimize.fmin_tnc(f_fp, x_init, messages=self.messages,
                       maxfun=self.max_f_eval, **opt_dict)
        self.x_opt = opt_result[0]
        self.f_opt = f_fp(self.x_opt)[0]
        self.funct_eval = opt_result[1]
        self.status = tnc_rcstrings[opt_result[2]]
 class opt_lbfgsb(Optimizer):
    def __init__(self, *args, **kwargs):
        Optimizer.__init__(self, *args, **kwargs)
        self.opt_name = "L-BFGS-B (Scipy implementation)"
    def opt(self, x_init, f_fp=None, f=None, fp=None):
        """
        Run the optimizer
        """
        rcstrings = ['Converged', 'Maximum number of f evaluations reached', 'Error']
        assert f_fp != None, "BFGS requires f_fp"
        if self.messages:
            iprint = 1
        else:
            iprint = -1
        opt_dict = {}
        if self.xtol is not None:
            print("WARNING: l-bfgs-b doesn't have an xtol arg, so I'm going to ignore it")
        if self.ftol is not None:
            print("WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it")
        if self.gtol is not None:
            opt_dict['pgtol'] = self.gtol
        if self.bfgs_factor is not None:
            opt_dict['factr'] = self.bfgs_factor
        opt_result = optimize.fmin_l_bfgs_b(f_fp, x_init, iprint=iprint,
                                            maxfun=self.max_iters, **opt_dict)
        self.x_opt = opt_result[0]
        self.f_opt = f_fp(self.x_opt)[0]
        self.funct_eval = opt_result[2]['funcalls']
        self.status = rcstrings[opt_result[2]['warnflag']]
        #a more helpful error message is available in opt_result in the Error case
        if opt_result[2]['warnflag']==2:
            self.status = 'Error' + str(opt_result[2]['task'])
 class opt_bfgs(Optimizer):
    def __init__(self, *args, **kwargs):
        Optimizer.__init__(self, *args, **kwargs)
        self.opt_name = "BFGS (Scipy implementation)"
    def opt(self, x_init, f_fp=None, f=None, fp=None):
        """
        Run the optimizer
        """
        rcstrings = ['','Maximum number of iterations exceeded', 'Gradient and/or function calls not changing']
        opt_dict = {}
        if self.xtol is not None:
            print("WARNING: bfgs doesn't have an xtol arg, so I'm going to ignore it")
        if self.ftol is not None:
            print("WARNING: bfgs doesn't have an ftol arg, so I'm going to ignore it")
        if self.gtol is not None:
            opt_dict['pgtol'] = self.gtol
        opt_result = optimize.fmin_bfgs(f, x_init, fp, disp=self.messages,
                                            maxiter=self.max_iters, full_output=True, **opt_dict)
        self.x_opt = opt_result[0]
        self.f_opt = f_fp(self.x_opt)[0]
        self.funct_eval = opt_result[4]
        self.status = rcstrings[opt_result[6]]
 class opt_simplex(Optimizer):
    def __init__(self, *args, **kwargs):
        Optimizer.__init__(self, *args, **kwargs)
        self.opt_name = "Nelder-Mead simplex routine (via Scipy)"
    def opt(self, x_init, f_fp=None, f=None, fp=None):
        """
        The simplex optimizer does not require gradients.
        """
        statuses = ['Converged', 'Maximum number of function evaluations made', 'Maximum number of iterations reached']
        opt_dict = {}
        if self.xtol is not None:
            opt_dict['xtol'] = self.xtol
        if self.ftol is not None:
            opt_dict['ftol'] = self.ftol
        if self.gtol is not None:
            print("WARNING: simplex doesn't have an gtol arg, so I'm going to ignore it")
        opt_result = optimize.fmin(f, x_init, (), disp=self.messages,
                   maxfun=self.max_f_eval, full_output=True, **opt_dict)
        self.x_opt = opt_result[0]
        self.f_opt = opt_result[1]
        self.funct_eval = opt_result[3]
        self.status = statuses[opt_result[4]]
        self.trace = None
 class opt_rasm(Optimizer):
    def __init__(self, *args, **kwargs):
        Optimizer.__init__(self, *args, **kwargs)
        self.opt_name = "Rasmussen's Conjugate Gradient"
    def opt(self, x_init, f_fp=None, f=None, fp=None):
        """
        Run Rasmussen's Conjugate Gradient optimizer
        """
        assert f_fp != None, "Rasmussen's minimizer requires f_fp"
        statuses = ['Converged', 'Line search failed', 'Maximum number of f evaluations reached',
                'NaNs in optimization']
        opt_dict = {}
        if self.xtol is not None:
            print("WARNING: minimize doesn't have an xtol arg, so I'm going to ignore it")
        if self.ftol is not None:
            print("WARNING: minimize doesn't have an ftol arg, so I'm going to ignore it")
        if self.gtol is not None:
            print("WARNING: minimize doesn't have an gtol arg, so I'm going to ignore it")
        opt_result = rasm.minimize(x_init, f_fp, (), messages=self.messages,
                                   maxnumfuneval=self.max_f_eval)
        self.x_opt = opt_result[0]
        self.f_opt = opt_result[1][-1]
        self.funct_eval = opt_result[2]
        self.status = statuses[opt_result[3]]
        self.trace = opt_result[1]
 class opt_SCG(Optimizer):
    def __init__(self, *args, **kwargs):
        if 'max_f_eval' in kwargs:
            warn("max_f_eval deprecated for SCG optimizer: use max_iters instead!\nIgnoring max_f_eval!", FutureWarning)
        Optimizer.__init__(self, *args, **kwargs)
        self.opt_name = "Scaled Conjugate Gradients"
    def opt(self, x_init, f_fp=None, f=None, fp=None):
        assert not f is None
        assert not fp is None
        opt_result = SCG(f, fp, x_init, display=self.messages,
                         maxiters=self.max_iters,
                         max_f_eval=self.max_f_eval,
                         xtol=self.xtol, ftol=self.ftol,
                         gtol=self.gtol)
        self.x_opt = opt_result[0]
        self.trace = opt_result[1]
        self.f_opt = self.trace[-1]
        self.funct_eval = opt_result[2]
        self.status = opt_result[3]
 class Opt_Adadelta(Optimizer):
    def __init__(self, step_rate=0.1, decay=0.9, momentum=0, *args, **kwargs):
        Optimizer.__init__(self, *args, **kwargs)
        self.opt_name = "Adadelta (climin)"
        self.step_rate=step_rate
        self.decay = decay
        self.momentum = momentum
    def opt(self, x_init, f_fp=None, f=None, fp=None):
        assert not fp is None
        import climin
        opt = climin.adadelta.Adadelta(x_init, fp, step_rate=self.step_rate, decay=self.decay, momentum=self.momentum)
        for info in opt:
            if info['n_iter']>=self.max_iters:
                self.x_opt =  opt.wrt
                self.status = 'maximum number of function evaluations exceeded '
                break
 def get_optimizer(f_min):
    optimizers = {'fmin_tnc': opt_tnc,
          'simplex': opt_simplex,
          'lbfgsb': opt_lbfgsb,
          'org-bfgs': opt_bfgs,
          'scg': opt_SCG,
          'adadelta':Opt_Adadelta}
    if rasm_available:
        optimizers['rasmussen'] = opt_rasm
    for opt_name in optimizers.keys():
        if opt_name.lower().find(f_min.lower()) != -1:
            return optimizers[opt_name]
    raise KeyError('No optimizer was found matching the name: %s' % f_min)
--- a/GPy/inference/optimization/scg.py
+++ b/GPy/inference/optimization/scg.py
@ -1,193 +0,0 @@
 # Copyright I. Nabney, N.Lawrence and James Hensman (1996 - 2014)
 # Scaled Conjuagte Gradients, originally in Matlab as part of the Netlab toolbox by I. Nabney, converted to python N. Lawrence and given a pythonic interface by James Hensman
 #      THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
 #      HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
 #      EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
 #      NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 #      MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 #      PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 #      REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 #      DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 #      EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 #      (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 #      OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 #      DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 #      HOWEVER CAUSED AND ON ANY THEORY OF
 #      LIABILITY, WHETHER IN CONTRACT, STRICT
 #      LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 #      OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 #      OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 #      POSSIBILITY OF SUCH DAMAGE.
 from __future__ import print_function
 import numpy as np
 import sys
 def print_out(len_maxiters, fnow, current_grad, beta, iteration):
    print('\r', end=' ')
    print('{0:>0{mi}g}  {1:> 12e}  {2:< 12.6e}  {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), end=' ') # print 'Iteration:', iteration, ' Objective:', fnow, '  Scale:', beta, '\r',
    sys.stdout.flush()
 def exponents(fnow, current_grad):
    exps = [np.abs(np.float(fnow)), current_grad]
    return np.sign(exps) * np.log10(exps).astype(int)
 def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True, xtol=None, ftol=None, gtol=None):
    """
    Optimisation through Scaled Conjugate Gradients (SCG)
    f: the objective function
    gradf : the gradient function (should return a 1D np.ndarray)
    x : the initial condition
    Returns
    x the optimal value for x
    flog : a list of all the objective values
    function_eval number of fn evaluations
    status: string describing convergence status
    """
    if xtol is None:
        xtol = 1e-6
    if ftol is None:
        ftol = 1e-6
    if gtol is None:
        gtol = 1e-5
    sigma0 = 1.0e-7
    fold = f(x, *optargs) # Initial function value.
    function_eval = 1
    fnow = fold
    gradnew = gradf(x, *optargs) # Initial gradient.
    function_eval += 1
    #if any(np.isnan(gradnew)):
    #    raise UnexpectedInfOrNan, "Gradient contribution resulted in a NaN value"
    current_grad = np.dot(gradnew, gradnew)
    gradold = gradnew.copy()
    d = -gradnew # Initial search direction.
    success = True # Force calculation of directional derivs.
    nsuccess = 0 # nsuccess counts number of successes.
    beta = 1.0 # Initial scale parameter.
    betamin = 1.0e-15 # Lower bound on scale.
    betamax = 1.0e15 # Upper bound on scale.
    status = "Not converged"
    flog = [fold]
    iteration = 0
    len_maxiters = len(str(maxiters))
    if display:
        print(' {0:{mi}s}   {1:11s}    {2:11s}    {3:11s}'.format("I", "F", "Scale", "|g|", mi=len_maxiters))
        exps = exponents(fnow, current_grad)
        p_iter = iteration
    # Main optimization loop.
    while iteration < maxiters:
        # Calculate first and second directional derivatives.
        if success:
            mu = np.dot(d, gradnew)
            if mu >= 0:
                d = -gradnew
                mu = np.dot(d, gradnew)
            kappa = np.dot(d, d)
            sigma = sigma0 / np.sqrt(kappa)
            xplus = x + sigma * d
            gplus = gradf(xplus, *optargs)
            function_eval += 1
            theta = np.dot(d, (gplus - gradnew)) / sigma
        # Increase effective curvature and evaluate step size alpha.
        delta = theta + beta * kappa
        if delta <= 0:
            delta = beta * kappa
            beta = beta - theta / kappa
        alpha = -mu / delta
        # Calculate the comparison ratio.
        xnew = x + alpha * d
        fnew = f(xnew, *optargs)
        function_eval += 1
        if function_eval >= max_f_eval:
            status = "maximum number of function evaluations exceeded"
            break
            return x, flog, function_eval, status
        Delta = 2.*(fnew - fold) / (alpha * mu)
        if Delta >= 0.:
            success = True
            nsuccess += 1
            x = xnew
            fnow = fnew
        else:
            success = False
            fnow = fold
        # Store relevant variables
        flog.append(fnow) # Current function value
        iteration += 1
        if display:
            print_out(len_maxiters, fnow, current_grad, beta, iteration)
            n_exps = exponents(fnow, current_grad)
            if iteration - p_iter >= 20 * np.random.rand():
                a = iteration >= p_iter * 2.78
                b = np.any(n_exps < exps)
                if a or b:
                    p_iter = iteration
                    print('')
                if b:
                    exps = n_exps
        if success:
            # Test for termination
            if (np.abs(fnew - fold) < ftol):
                status = 'converged - relative reduction in objective'
                break
 #                 return x, flog, function_eval, status
            elif (np.max(np.abs(alpha * d)) < xtol):
                status = 'converged - relative stepsize'
                break
            else:
                # Update variables for new position
                gradold = gradnew
                gradnew = gradf(x, *optargs)
                function_eval += 1
                current_grad = np.dot(gradnew, gradnew)
                fold = fnew
                # If the gradient is zero then we are done.
                if current_grad <= gtol:
                    status = 'converged - relative reduction in gradient'
                    break
                    # return x, flog, function_eval, status
        # Adjust beta according to comparison ratio.
        if Delta < 0.25:
            beta = min(4.0 * beta, betamax)
        if Delta > 0.75:
            beta = max(0.25 * beta, betamin)
        # Update search direction using Polak-Ribiere formula, or re-start
        # in direction of negative gradient after nparams steps.
        if nsuccess == x.size:
            d = -gradnew
            beta = 1. # This is not in the original paper
            nsuccess = 0
        elif success:
            Gamma = np.dot(gradold - gradnew, gradnew) / (mu)
            d = Gamma * d - gradnew
    else:
        # If we get here, then we haven't terminated in the given number of
        # iterations.
        status = "maxiter exceeded"
    if display:
        print_out(len_maxiters, fnow, current_grad, beta, iteration)
        print("")
        print(status)
    return x, flog, function_eval, status
--- a/GPy/inference/optimization/stochastics.py
+++ b/GPy/inference/optimization/stochastics.py
@ -1,92 +0,0 @@
 # Copyright (c) 2012-2014, Max Zwiessele
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 class StochasticStorage(object):
    '''
    This is a container for holding the stochastic parameters,
    such as subset indices or step length and so on.
    self.d has to be a list of lists:
    [dimension indices, nan indices for those dimensions]
    so that the minibatches can be used as efficiently as possible.10
    '''
    def __init__(self, model):
        """
        Initialize this stochastic container using the given model
        """
    def do_stochastics(self):
        """
        Update the internal state to the next batch of the stochastic
        descent algorithm.
        """
        pass
    def reset(self):
        """
        Reset the state of this stochastics generator.
        """
 class SparseGPMissing(StochasticStorage):
    def __init__(self, model, batchsize=1):
        """
        Here we want to loop over all dimensions everytime.
        Thus, we can just make sure the loop goes over self.d every
        time. We will try to get batches which look the same together
        which speeds up calculations significantly.
        """
        import numpy as np
        self.Y = model.Y_normalized
        bdict = {}
        #For N > 1000 array2string default crops
        opt = np.get_printoptions()
        np.set_printoptions(threshold=np.inf)
        for d in range(self.Y.shape[1]):
            inan = np.isnan(self.Y)[:, d]
            arr_str = np.array2string(inan, np.inf, 0, True, '', formatter={'bool':lambda x: '1' if x else '0'})
            try:
                bdict[arr_str][0].append(d)
            except:
                bdict[arr_str] = [[d], ~inan]
        np.set_printoptions(**opt)
        self.d = bdict.values()
 class SparseGPStochastics(StochasticStorage):
    """
    For the sparse gp we need to store the dimension we are in,
    and the indices corresponding to those
    """
    def __init__(self, model, batchsize=1, missing_data=True):
        self.batchsize = batchsize
        self.output_dim = model.Y.shape[1]
        self.Y = model.Y_normalized
        self.missing_data = missing_data
        self.reset()
        self.do_stochastics()
    def do_stochastics(self):
        import numpy as np
        if self.batchsize == 1:
            self.current_dim = (self.current_dim+1)%self.output_dim
            self.d = [[[self.current_dim], np.isnan(self.Y[:, self.current_dim]) if self.missing_data else None]]
        else:
            self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False)
            bdict = {}
            if self.missing_data:
                opt = np.get_printoptions()
                np.set_printoptions(threshold=np.inf)
                for d in self.d:
                    inan = np.isnan(self.Y[:, d])
                    arr_str = np.array2string(inan,np.inf, 0,True, '',formatter={'bool':lambda x: '1' if x else '0'})
                    try:
                        bdict[arr_str][0].append(d)
                    except:
                        bdict[arr_str] = [[d], ~inan]
                np.set_printoptions(**opt)
                self.d = bdict.values()
            else:
                self.d = [[self.d, None]]
    def reset(self):
        self.current_dim = -1
        self.d = None