new termination rule for scg

Max Zwiessele 2013-05-08 09:44:09 +01:00
parent 0e8bc5662a
commit 803a1d99ed
6 changed files with 84 additions and 50 deletions
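In short: SCG gains a gradient-norm stopping test. Besides the existing step-size (xtol) and objective-change (ftol) criteria, the optimiser now also stops once the squared gradient norm drops to gtol or below, and the Optimizer wrapper forwards the new keyword. A rough sketch of the combined test, using the names from the diff below (the helper function itself is illustrative, not part of the commit):

    import numpy as np

    def scg_should_stop(alpha, d, fnew, fold, gradnew, xtol=1e-6, ftol=1e-6, gtol=1e-5):
        # Defaults mirror the values SCG now fills in when None is passed.
        small_step = np.max(np.abs(alpha * d)) < xtol       # parameter change (xtol)
        small_decrease = np.abs(fnew - fold) < ftol         # objective change (ftol)
        small_gradient = np.dot(gradnew, gradnew) <= gtol   # squared gradient norm (new gtol test)
        return small_step or small_decrease or small_gradient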

View file

@@ -207,7 +207,7 @@ def bgplvm_simulation(burnin='scg', plot_sim=False,
                       max_burnin=100, true_X=False,
                       do_opt=True,
                       max_f_eval=1000):
-    D1, D2, D3, N, M, Q = 10, 8, 8, 250, 10, 6
+    D1, D2, D3, N, M, Q = 15, 8, 8, 350, 3, 6
     slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, M, Q, plot_sim)
     from GPy.models import mrd

View file

@@ -1,6 +1,6 @@
-#Copyright I. Nabney, N.Lawrence and James Hensman (1996 - 2012)
-#Scaled Conjuagte Gradients, originally in Matlab as part of the Netlab toolbox by I. Nabney, converted to python N. Lawrence and given a pythonic interface by James Hensman
+# Copyright I. Nabney, N.Lawrence and James Hensman (1996 - 2012)
+# Scaled Conjuagte Gradients, originally in Matlab as part of the Netlab toolbox by I. Nabney, converted to python N. Lawrence and given a pythonic interface by James Hensman
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
 # HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
@@ -25,7 +25,7 @@
 import numpy as np
 import sys
-def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xtol=1e-6, ftol=1e-6):
+def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xtol=None, ftol=None, gtol=None):
     """
     Optimisation through Scaled Conjugate Gradients (SCG)
@@ -38,19 +38,24 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
     flog : a list of all the objective values
     """
+    if xtol is None:
+        xtol = 1e-6
+    if ftol is None:
+        ftol = 1e-6
+    if gtol is None:
+        gtol = 1e-5
     sigma0 = 1.0e-4
     fold = f(x, *optargs) # Initial function value.
     function_eval = 1
     fnow = fold
     gradnew = gradf(x, *optargs) # Initial gradient.
     gradold = gradnew.copy()
     d = -gradnew # Initial search direction.
     success = True # Force calculation of directional derivs.
     nsuccess = 0 # nsuccess counts number of successes.
     beta = 1.0 # Initial scale parameter.
     betamin = 1.0e-15 # Lower bound on scale.
     betamax = 1.0e100 # Upper bound on scale.
     status = "Not converged"
     flog = [fold]
@@ -67,21 +72,21 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
             d = -gradnew
             mu = np.dot(d, gradnew)
         kappa = np.dot(d, d)
-        sigma = sigma0/np.sqrt(kappa)
-        xplus = x + sigma*d
+        sigma = sigma0 / np.sqrt(kappa)
+        xplus = x + sigma * d
         gplus = gradf(xplus, *optargs)
-        theta = np.dot(d, (gplus - gradnew))/sigma
+        theta = np.dot(d, (gplus - gradnew)) / sigma
         # Increase effective curvature and evaluate step size alpha.
-        delta = theta + beta*kappa
+        delta = theta + beta * kappa
         if delta <= 0:
-            delta = beta*kappa
-            beta = beta - theta/kappa
-        alpha = - mu/delta
+            delta = beta * kappa
+            beta = beta - theta / kappa
+        alpha = -mu / delta
         # Calculate the comparison ratio.
-        xnew = x + alpha*d
+        xnew = x + alpha * d
         fnew = f(xnew, *optargs)
         function_eval += 1
@@ -89,8 +94,8 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
             status = "Maximum number of function evaluations exceeded"
             return x, flog, function_eval, status
-        Delta = 2.*(fnew - fold)/(alpha*mu)
+        Delta = 2.*(fnew - fold) / (alpha * mu)
         if Delta >= 0.:
             success = True
             nsuccess += 1
             x = xnew
@@ -100,19 +105,19 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
             fnow = fold
         # Store relevant variables
         flog.append(fnow) # Current function value
         iteration += 1
         if display:
             print '\r',
-            print 'Iteration: {0:>5g} Objective:{1:> 12e} Scale:{2:> 12e}'.format(iteration, fnow, beta),
+            print 'i: {0:>5g} f:{1:> 12e} b:{2:> 12e} |g|:{3:> 12e}'.format(iteration, fnow, beta, np.dot(gradnew, gradnew)),
             # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
             sys.stdout.flush()
         if success:
             # Test for termination
-            if (np.max(np.abs(alpha*d)) < xtol) or (np.abs(fnew-fold) < ftol):
-                status='converged'
+            if (np.dot(gradnew, gradnew)) <= gtol or (np.max(np.abs(alpha * d)) < xtol) or (np.abs(fnew - fold) < ftol):
+                status = 'converged'
                 return x, flog, function_eval, status
             else:
@@ -121,14 +126,15 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
             gradold = gradnew
             gradnew = gradf(x, *optargs)
             # If the gradient is zero then we are done.
-            if np.dot(gradnew,gradnew) == 0:
+            if (np.dot(gradnew, gradnew)) <= gtol:
+                status = 'converged'
                 return x, flog, function_eval, status
         # Adjust beta according to comparison ratio.
         if Delta < 0.25:
-            beta = min(4.0*beta, betamax)
+            beta = min(4.0 * beta, betamax)
         if Delta > 0.75:
-            beta = max(0.5*beta, betamin)
+            beta = max(0.5 * beta, betamin)
         # Update search direction using Polak-Ribiere formula, or re-start
         # in direction of negative gradient after nparams steps.
@@ -136,8 +142,8 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
             d = -gradnew
             nsuccess = 0
         elif success:
-            gamma = np.dot(gradold - gradnew,gradnew)/(mu)
-            d = gamma*d - gradnew
+            gamma = np.dot(gradold - gradnew, gradnew) / (mu)
+            d = gamma * d - gradnew
     # If we get here, then we haven't terminated in the given number of
     # iterations.
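With the SCG function above in scope, a call using the new keyword might look like the sketch below (the SciPy Rosenbrock functions are only a stand-in objective; the tolerance values shown are the defaults SCG substitutes when None is passed):

    import numpy as np
    from scipy.optimize import rosen, rosen_der

    x0 = np.zeros(4)
    x_opt, flog, n_evals, status = SCG(rosen, rosen_der, x0,
                                       maxiters=500, max_f_eval=500, display=False,
                                       xtol=1e-6, ftol=1e-6, gtol=1e-5)
    print status, n_evals, flog[-1]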

View file

@@ -12,6 +12,7 @@ from scipy.optimize.linesearch import line_search_wolfe1, line_search_wolfe2
 from threading import Thread
 import numpy
 import sys
+import time
 RUNNING = "running"
 CONVERGED = "converged"
@@ -71,7 +72,10 @@ class _Async_Optimization(Thread):
     def callback_return(self, *a):
         self.callback(*a)
-        self.callback(self.SENTINEL)
+        if self.outq is not None:
+            self.outq.put(self.SENTINEL)
+        if self.messages:
+            print ""
         self.runsignal.clear()
     def run(self, *args, **kwargs):
@@ -105,7 +109,7 @@ class _CGDAsync(_Async_Optimization):
                 status = MAX_F_EVAL
             gi = -self.df(xi, *a, **kw)
-            if numpy.dot(gi.T, gi) < self.gtol:
+            if numpy.dot(gi.T, gi) <= self.gtol:
                 status = CONVERGED
                 break
             if numpy.isnan(numpy.dot(gi.T, gi)):
@@ -123,10 +127,8 @@
                                    xi,
                                    si, gi,
                                    fi, fi_old)
-            if alphai is not None and fi2 < fi:
-                fi, fi_old = fi2, fi_old2
-            else:
-                alphai, _, _, fi, fi_old, gfi = \
+            if alphai is None:
+                alphai, _, _, fi2, fi_old2, gfi = \
                     line_search_wolfe2(self.f, self.df,
                                        xi, si, gi,
                                        fi, fi_old)
@@ -134,11 +136,14 @@
                     # This line search also failed to find a better solution.
                     status = LINE_SEARCH
                     break
+            if fi2 < fi:
+                fi, fi_old = fi2, fi_old2
             if gfi is not None:
                 gi = gfi
             if numpy.isnan(fi) or fi_old < fi:
                 gi, ur, si = self.reset(xi, *a, **kw)
             else:
                 xi += numpy.dot(alphai, si)
                 if self.messages:
@@ -164,6 +169,7 @@ class Async_Optimize(object):
         try:
             for ret in iter(lambda: q.get(timeout=1), self.SENTINEL):
                 self.callback(*ret)
+            self.runsignal.clear()
         except Empty:
             pass
@@ -193,13 +199,21 @@
         while self.runsignal.is_set():
             try:
                 p.join(1)
-                # c.join(1)
+                c.join(1)
             except KeyboardInterrupt:
                 # print "^C"
                 self.runsignal.clear()
                 p.join()
                 c.join()
         if c and c.is_alive():
+            # self.runsignal.set()
+            # while self.runsignal.is_set():
+            #     try:
+            #         c.join(.1)
+            #     except KeyboardInterrupt:
+            #         # print "^C"
+            #         self.runsignal.clear()
+            # c.join()
             print "WARNING: callback still running, optimisation done!"
         return p.result
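The queue handling above follows a standard sentinel pattern: the worker thread puts a sentinel object on the output queue when it finishes, and the consumer's iter(callable, sentinel) loop stops as soon as it reads it, now also clearing the run signal. A minimal standalone illustration of that pattern under Python 2 (the names here are illustrative, not the module's own):

    import threading
    from Queue import Queue, Empty   # Python 2 stdlib, matching this codebase

    SENTINEL = object()

    def worker(outq):
        # stand-in for the optimisation thread: emit results, then the sentinel
        for i in range(3):
            outq.put((i, "intermediate result"))
        outq.put(SENTINEL)

    q = Queue()
    t = threading.Thread(target=worker, args=(q,))
    t.start()
    try:
        # stops as soon as the sentinel is read, mirroring the loop above
        for ret in iter(lambda: q.get(timeout=1), SENTINEL):
            print ret                 # stand-in for self.callback(*ret)
    except Empty:
        pass                          # worker ended without sending the sentinel
    t.join()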

View file

@@ -41,3 +41,13 @@ class FletcherReeves(GDUpdateRule):
         if tmp:
             return tmp / numpy.dot(self.gradold.T, self.gradnatold)
         return tmp
+
+class PolakRibiere(GDUpdateRule):
+    '''
+    Fletcher Reeves update rule for gamma
+    '''
+    def _gamma(self, *a, **kw):
+        tmp = numpy.dot((self.grad - self.gradold).T, self.gradnat)
+        if tmp:
+            return tmp / numpy.dot(self.gradold.T, self.gradnatold)
+        return tmp
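Note that the new class reuses the FletcherReeves docstring while computing a different numerator. The textbook Polak-Ribiere coefficient is gamma = g_new^T (g_new - g_old) / (g_old^T g_old); the class above uses the stored natural-gradient counterparts. For comparison, a plain-NumPy sketch of the standard formula (not the GDUpdateRule API), with the common clip at zero:

    import numpy as np

    def polak_ribiere_gamma(grad_new, grad_old):
        # gamma = g_new . (g_new - g_old) / (g_old . g_old); clipping at zero is the
        # common "PR+" variant, which falls back to steepest descent when negative.
        denom = np.dot(grad_old, grad_old)
        if denom == 0.0:
            return 0.0
        return max(0.0, np.dot(grad_new, grad_new - grad_old) / denom)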

View file

@@ -29,7 +29,8 @@ class Optimizer():
     :rtype: optimizer object.
     """
-    def __init__(self, x_init, messages=False, model = None, max_f_eval=1e4, max_iters = 1e3, ftol=None, gtol=None, xtol=None):
+    def __init__(self, x_init, messages=False, model=None, max_f_eval=1e4, max_iters=1e3,
+                 ftol=None, gtol=None, xtol=None):
         self.opt_name = None
         self.x_init = x_init
         self.messages = messages
@@ -205,7 +206,11 @@ class opt_SCG(Optimizer):
     def opt(self, f_fp = None, f = None, fp = None):
         assert not f is None
         assert not fp is None
-        opt_result = SCG(f,fp,self.x_init, display=self.messages, maxiters=self.max_iters, max_f_eval=self.max_f_eval, xtol=self.xtol, ftol=self.ftol)
+        opt_result = SCG(f, fp, self.x_init, display=self.messages,
+                         maxiters=self.max_iters,
+                         max_f_eval=self.max_f_eval,
+                         xtol=self.xtol, ftol=self.ftol,
+                         gtol=self.gtol)
         self.x_opt = opt_result[0]
         self.trace = opt_result[1]
         self.f_opt = self.trace[-1]
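With opt_SCG from this module in scope, the new tolerance can be set on the wrapper and is forwarded to SCG. A rough usage sketch, assuming opt_SCG passes its constructor keywords through to the Optimizer signature above (the Rosenbrock functions are only a stand-in objective):

    import numpy as np
    from scipy.optimize import rosen, rosen_der

    opt = opt_SCG(x_init=np.zeros(4), messages=False, max_iters=500, max_f_eval=1000,
                  xtol=1e-6, ftol=1e-6, gtol=1e-5)
    opt.opt(f=rosen, fp=rosen_der)
    print opt.x_opt, opt.f_opt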

View file

@@ -9,6 +9,7 @@ from GPy.inference.conjugate_gradient_descent import CGD, RUNNING
 import pylab
 import time
 from scipy.optimize.optimize import rosen, rosen_der
+from GPy.inference.gradient_descent_update_rules import PolakRibiere
 class Test(unittest.TestCase):
@@ -71,10 +72,12 @@ if __name__ == "__main__":
     # df = lambda x: numpy.dot(A, x) - b
     f = rosen
     df = rosen_der
-    x0 = numpy.random.randn(N) * .5
+    x0 = (numpy.random.randn(N) * .5) + numpy.ones(N)
+    print x0
     opt = CGD()
+    pylab.ion()
     fig = pylab.figure("cgd optimize")
     if fig.axes:
         ax = fig.axes[0]
@@ -83,13 +86,13 @@ if __name__ == "__main__":
         ax = fig.add_subplot(111, projection='3d')
     interpolation = 40
-    x, y = numpy.linspace(-1, 1, interpolation)[:, None], numpy.linspace(-1, 1, interpolation)[:, None]
+    x, y = numpy.linspace(.5, 1.5, interpolation)[:, None], numpy.linspace(.5, 1.5, interpolation)[:, None]
     X, Y = numpy.meshgrid(x, y)
     fXY = numpy.array([f(numpy.array([x, y])) for x, y in zip(X.flatten(), Y.flatten())]).reshape(interpolation, interpolation)
     ax.plot_wireframe(X, Y, fXY)
     xopts = [x0.copy()]
-    optplts, = ax.plot3D([x0[0]], [x0[1]], zs=f(x0), marker='o', color='r')
+    optplts, = ax.plot3D([x0[0]], [x0[1]], zs=f(x0), marker='', color='r')
     raw_input("enter to start optimize")
     res = [0]
@@ -102,11 +105,7 @@
         if r[-1] != RUNNING:
             res[0] = r
-    p, c = opt.opt_async(f, df, x0.copy(), callback, messages=True, maxiter=1000,
-                         report_every=20, gtol=1e-12)
-    pylab.ion()
-    pylab.show()
+    res[0] = opt.opt(f, df, x0.copy(), callback, messages=True, maxiter=1000,
+                     report_every=7, gtol=1e-12, update_rule=PolakRibiere)
     pass