mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-05-18 13:55:14 +02:00
REFACTORING: model names, lowercase, classes uppercase
This commit is contained in:
parent
2a39440619
commit
2e5e8ac026
50 changed files with 436 additions and 3307 deletions
|
|
@ -1,169 +0,0 @@
|
|||
# Copyright I. Nabney, N.Lawrence and James Hensman (1996 - 2012)
|
||||
|
||||
# Scaled Conjuagte Gradients, originally in Matlab as part of the Netlab toolbox by I. Nabney, converted to python N. Lawrence and given a pythonic interface by James Hensman
|
||||
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
|
||||
# HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
|
||||
# NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
# REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
|
||||
# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
# HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
||||
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
import numpy as np
|
||||
import sys
|
||||
|
||||
|
||||
def print_out(len_maxiters, display, fnow, current_grad, beta, iteration):
|
||||
if display:
|
||||
print '\r',
|
||||
print '{0:>0{mi}g} {1:> 12e} {2:> 12e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
|
||||
sys.stdout.flush()
|
||||
|
||||
def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xtol=None, ftol=None, gtol=None):
|
||||
"""
|
||||
Optimisation through Scaled Conjugate Gradients (SCG)
|
||||
|
||||
f: the objective function
|
||||
gradf : the gradient function (should return a 1D np.ndarray)
|
||||
x : the initial condition
|
||||
|
||||
Returns
|
||||
x the optimal value for x
|
||||
flog : a list of all the objective values
|
||||
function_eval number of fn evaluations
|
||||
status: string describing convergence status
|
||||
"""
|
||||
if xtol is None:
|
||||
xtol = 1e-6
|
||||
if ftol is None:
|
||||
ftol = 1e-6
|
||||
if gtol is None:
|
||||
gtol = 1e-5
|
||||
sigma0 = 1.0e-8
|
||||
fold = f(x, *optargs) # Initial function value.
|
||||
function_eval = 1
|
||||
fnow = fold
|
||||
gradnew = gradf(x, *optargs) # Initial gradient.
|
||||
current_grad = np.dot(gradnew, gradnew)
|
||||
gradold = gradnew.copy()
|
||||
d = -gradnew # Initial search direction.
|
||||
success = True # Force calculation of directional derivs.
|
||||
nsuccess = 0 # nsuccess counts number of successes.
|
||||
beta = 1.0 # Initial scale parameter.
|
||||
betamin = 1.0e-60 # Lower bound on scale.
|
||||
betamax = 1.0e100 # Upper bound on scale.
|
||||
status = "Not converged"
|
||||
|
||||
flog = [fold]
|
||||
|
||||
iteration = 0
|
||||
|
||||
len_maxiters = len(str(maxiters))
|
||||
if display:
|
||||
print ' {0:{mi}s} {1:11s} {2:11s} {3:11s}'.format("I", "F", "Scale", "|g|", mi=len_maxiters)
|
||||
|
||||
# Main optimization loop.
|
||||
while iteration < maxiters:
|
||||
|
||||
# Calculate first and second directional derivatives.
|
||||
if success:
|
||||
mu = np.dot(d, gradnew)
|
||||
if mu >= 0:
|
||||
d = -gradnew
|
||||
mu = np.dot(d, gradnew)
|
||||
kappa = np.dot(d, d)
|
||||
sigma = sigma0 / np.sqrt(kappa)
|
||||
xplus = x + sigma * d
|
||||
gplus = gradf(xplus, *optargs)
|
||||
theta = np.dot(d, (gplus - gradnew)) / sigma
|
||||
|
||||
# Increase effective curvature and evaluate step size alpha.
|
||||
delta = theta + beta * kappa
|
||||
if delta <= 0:
|
||||
delta = beta * kappa
|
||||
beta = beta - theta / kappa
|
||||
|
||||
alpha = -mu / delta
|
||||
|
||||
# Calculate the comparison ratio.
|
||||
xnew = x + alpha * d
|
||||
fnew = f(xnew, *optargs)
|
||||
function_eval += 1
|
||||
|
||||
if function_eval >= max_f_eval:
|
||||
status = "Maximum number of function evaluations exceeded"
|
||||
break
|
||||
# return x, flog, function_eval, status
|
||||
|
||||
Delta = 2.*(fnew - fold) / (alpha * mu)
|
||||
if Delta >= 0.:
|
||||
success = True
|
||||
nsuccess += 1
|
||||
x = xnew
|
||||
fnow = fnew
|
||||
else:
|
||||
success = False
|
||||
fnow = fold
|
||||
|
||||
# Store relevant variables
|
||||
flog.append(fnow) # Current function value
|
||||
|
||||
iteration += 1
|
||||
print_out(len_maxiters, display, fnow, current_grad, beta, iteration)
|
||||
|
||||
if success:
|
||||
# Test for termination
|
||||
if (np.max(np.abs(alpha * d)) < xtol) or (np.abs(fnew - fold) < ftol):
|
||||
status = 'converged'
|
||||
break
|
||||
# return x, flog, function_eval, status
|
||||
|
||||
else:
|
||||
# Update variables for new position
|
||||
gradnew = gradf(x, *optargs)
|
||||
current_grad = np.dot(gradnew, gradnew)
|
||||
gradold = gradnew
|
||||
fold = fnew
|
||||
# If the gradient is zero then we are done.
|
||||
if current_grad <= gtol:
|
||||
status = 'converged'
|
||||
break
|
||||
# return x, flog, function_eval, status
|
||||
|
||||
# Adjust beta according to comparison ratio.
|
||||
if Delta < 0.25:
|
||||
beta = min(4.0 * beta, betamax)
|
||||
if Delta > 0.75:
|
||||
beta = max(0.5 * beta, betamin)
|
||||
|
||||
# Update search direction using Polak-Ribiere formula, or re-start
|
||||
# in direction of negative gradient after nparams steps.
|
||||
if nsuccess == x.size:
|
||||
d = -gradnew
|
||||
# beta = 1. # TODO: betareset!!
|
||||
nsuccess = 0
|
||||
elif success:
|
||||
Gamma = np.dot(gradold - gradnew, gradnew) / (mu)
|
||||
d = Gamma * d - gradnew
|
||||
else:
|
||||
# If we get here, then we haven't terminated in the given number of
|
||||
# iterations.
|
||||
status = "maxiter exceeded"
|
||||
|
||||
if display:
|
||||
print_out(len_maxiters, display, fnow, current_grad, beta, iteration)
|
||||
print ""
|
||||
return x, flog, function_eval, status
|
||||
|
|
@ -1,356 +0,0 @@
|
|||
import numpy as np
|
||||
import scipy as sp
|
||||
import scipy.sparse
|
||||
from optimization import Optimizer
|
||||
from scipy import linalg, optimize
|
||||
import pylab as plt
|
||||
import copy, sys, pickle
|
||||
|
||||
class opt_SGD(Optimizer):
|
||||
"""
|
||||
Optimize using stochastic gradient descent.
|
||||
|
||||
*** Parameters ***
|
||||
model: reference to the model object
|
||||
iterations: number of iterations
|
||||
learning_rate: learning rate
|
||||
momentum: momentum
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, start, iterations = 10, learning_rate = 1e-4, momentum = 0.9, model = None, messages = False, batch_size = 1, self_paced = False, center = True, iteration_file = None, learning_rate_adaptation=None, actual_iter=None, schedule=None, **kwargs):
|
||||
self.opt_name = "Stochastic Gradient Descent"
|
||||
|
||||
self.model = model
|
||||
self.iterations = iterations
|
||||
self.momentum = momentum
|
||||
self.learning_rate = learning_rate
|
||||
self.x_opt = None
|
||||
self.f_opt = None
|
||||
self.messages = messages
|
||||
self.batch_size = batch_size
|
||||
self.self_paced = self_paced
|
||||
self.center = center
|
||||
self.param_traces = [('noise',[])]
|
||||
self.iteration_file = iteration_file
|
||||
self.learning_rate_adaptation = learning_rate_adaptation
|
||||
self.actual_iter = actual_iter
|
||||
if self.learning_rate_adaptation != None:
|
||||
if self.learning_rate_adaptation == 'annealing':
|
||||
self.learning_rate_0 = self.learning_rate
|
||||
else:
|
||||
self.learning_rate_0 = self.learning_rate.mean()
|
||||
|
||||
self.schedule = schedule
|
||||
# if len([p for p in self.model.kern.parts if p.name == 'bias']) == 1:
|
||||
# self.param_traces.append(('bias',[]))
|
||||
# if len([p for p in self.model.kern.parts if p.name == 'linear']) == 1:
|
||||
# self.param_traces.append(('linear',[]))
|
||||
# if len([p for p in self.model.kern.parts if p.name == 'rbf']) == 1:
|
||||
# self.param_traces.append(('rbf_var',[]))
|
||||
|
||||
self.param_traces = dict(self.param_traces)
|
||||
self.fopt_trace = []
|
||||
|
||||
num_params = len(self.model._get_params())
|
||||
if isinstance(self.learning_rate, float):
|
||||
self.learning_rate = np.ones((num_params,)) * self.learning_rate
|
||||
|
||||
assert (len(self.learning_rate) == num_params), "there must be one learning rate per parameter"
|
||||
|
||||
def __str__(self):
|
||||
status = "\nOptimizer: \t\t\t %s\n" % self.opt_name
|
||||
status += "f(x_opt): \t\t\t %.4f\n" % self.f_opt
|
||||
status += "Number of iterations: \t\t %d\n" % self.iterations
|
||||
status += "Learning rate: \t\t\t max %.3f, min %.3f\n" % (self.learning_rate.max(), self.learning_rate.min())
|
||||
status += "Momentum: \t\t\t %.3f\n" % self.momentum
|
||||
status += "Batch size: \t\t\t %d\n" % self.batch_size
|
||||
status += "Time elapsed: \t\t\t %s\n" % self.time
|
||||
return status
|
||||
|
||||
def plot_traces(self):
|
||||
plt.figure()
|
||||
plt.subplot(211)
|
||||
plt.title('Parameters')
|
||||
for k in self.param_traces.keys():
|
||||
plt.plot(self.param_traces[k], label=k)
|
||||
plt.legend(loc=0)
|
||||
plt.subplot(212)
|
||||
plt.title('Objective function')
|
||||
plt.plot(self.fopt_trace)
|
||||
|
||||
|
||||
def non_null_samples(self, data):
|
||||
return (np.isnan(data).sum(axis=1) == 0)
|
||||
|
||||
def check_for_missing(self, data):
|
||||
if sp.sparse.issparse(self.model.likelihood.Y):
|
||||
return True
|
||||
else:
|
||||
return np.isnan(data).sum() > 0
|
||||
|
||||
def subset_parameter_vector(self, x, samples, param_shapes):
|
||||
subset = np.array([], dtype = int)
|
||||
x = np.arange(0, len(x))
|
||||
i = 0
|
||||
|
||||
for s in param_shapes:
|
||||
N, input_dim = s
|
||||
X = x[i:i+N*input_dim].reshape(N, input_dim)
|
||||
X = X[samples]
|
||||
subset = np.append(subset, X.flatten())
|
||||
i += N*input_dim
|
||||
|
||||
subset = np.append(subset, x[i:])
|
||||
|
||||
return subset
|
||||
|
||||
def shift_constraints(self, j):
|
||||
|
||||
constrained_indices = copy.deepcopy(self.model.constrained_indices)
|
||||
|
||||
for c, constraint in enumerate(constrained_indices):
|
||||
mask = (np.ones_like(constrained_indices[c]) == 1)
|
||||
for i in range(len(constrained_indices[c])):
|
||||
pos = np.where(j == constrained_indices[c][i])[0]
|
||||
if len(pos) == 1:
|
||||
self.model.constrained_indices[c][i] = pos
|
||||
else:
|
||||
mask[i] = False
|
||||
|
||||
self.model.constrained_indices[c] = self.model.constrained_indices[c][mask]
|
||||
return constrained_indices
|
||||
# back them up
|
||||
# bounded_i = copy.deepcopy(self.model.constrained_bounded_indices)
|
||||
# bounded_l = copy.deepcopy(self.model.constrained_bounded_lowers)
|
||||
# bounded_u = copy.deepcopy(self.model.constrained_bounded_uppers)
|
||||
|
||||
# for b in range(len(bounded_i)): # for each group of constraints
|
||||
# for bc in range(len(bounded_i[b])):
|
||||
# pos = np.where(j == bounded_i[b][bc])[0]
|
||||
# if len(pos) == 1:
|
||||
# pos2 = np.where(self.model.constrained_bounded_indices[b] == bounded_i[b][bc])[0][0]
|
||||
# self.model.constrained_bounded_indices[b][pos2] = pos[0]
|
||||
# else:
|
||||
# if len(self.model.constrained_bounded_indices[b]) == 1:
|
||||
# # if it's the last index to be removed
|
||||
# # the logic here is just a mess. If we remove the last one, then all the
|
||||
# # b-indices change and we have to iterate through everything to find our
|
||||
# # current index. Can't deal with this right now.
|
||||
# raise NotImplementedError
|
||||
|
||||
# else: # just remove it from the indices
|
||||
# mask = self.model.constrained_bounded_indices[b] != bc
|
||||
# self.model.constrained_bounded_indices[b] = self.model.constrained_bounded_indices[b][mask]
|
||||
|
||||
|
||||
# # here we shif the positive constraints. We cycle through each positive
|
||||
# # constraint
|
||||
# positive = self.model.constrained_positive_indices.copy()
|
||||
# mask = (np.ones_like(positive) == 1)
|
||||
# for p in range(len(positive)):
|
||||
# # we now check whether the constrained index appears in the j vector
|
||||
# # (the vector of the "active" indices)
|
||||
# pos = np.where(j == self.model.constrained_positive_indices[p])[0]
|
||||
# if len(pos) == 1:
|
||||
# self.model.constrained_positive_indices[p] = pos
|
||||
# else:
|
||||
# mask[p] = False
|
||||
# self.model.constrained_positive_indices = self.model.constrained_positive_indices[mask]
|
||||
|
||||
# return (bounded_i, bounded_l, bounded_u), positive
|
||||
|
||||
def restore_constraints(self, c):#b, p):
|
||||
# self.model.constrained_bounded_indices = b[0]
|
||||
# self.model.constrained_bounded_lowers = b[1]
|
||||
# self.model.constrained_bounded_uppers = b[2]
|
||||
# self.model.constrained_positive_indices = p
|
||||
self.model.constrained_indices = c
|
||||
|
||||
def get_param_shapes(self, N = None, input_dim = None):
|
||||
model_name = self.model.__class__.__name__
|
||||
if model_name == 'GPLVM':
|
||||
return [(N, input_dim)]
|
||||
if model_name == 'Bayesian_GPLVM':
|
||||
return [(N, input_dim), (N, input_dim)]
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
def step_with_missing_data(self, f_fp, X, step, shapes):
|
||||
N, input_dim = X.shape
|
||||
|
||||
if not sp.sparse.issparse(self.model.likelihood.Y):
|
||||
Y = self.model.likelihood.Y
|
||||
samples = self.non_null_samples(self.model.likelihood.Y)
|
||||
self.model.N = samples.sum()
|
||||
Y = Y[samples]
|
||||
else:
|
||||
samples = self.model.likelihood.Y.nonzero()[0]
|
||||
self.model.N = len(samples)
|
||||
Y = np.asarray(self.model.likelihood.Y[samples].todense(), dtype = np.float64)
|
||||
|
||||
if self.model.N == 0 or Y.std() == 0.0:
|
||||
return 0, step, self.model.N
|
||||
|
||||
self.model.likelihood._offset = Y.mean()
|
||||
self.model.likelihood._scale = Y.std()
|
||||
self.model.likelihood.set_data(Y)
|
||||
# self.model.likelihood.V = self.model.likelihood.Y*self.model.likelihood.precision
|
||||
|
||||
sigma = self.model.likelihood._variance
|
||||
self.model.likelihood._variance = None # invalidate cache
|
||||
self.model.likelihood._set_params(sigma)
|
||||
|
||||
|
||||
j = self.subset_parameter_vector(self.x_opt, samples, shapes)
|
||||
self.model.X = X[samples]
|
||||
|
||||
model_name = self.model.__class__.__name__
|
||||
|
||||
if model_name == 'Bayesian_GPLVM':
|
||||
self.model.likelihood.YYT = np.dot(self.model.likelihood.Y, self.model.likelihood.Y.T)
|
||||
self.model.likelihood.trYYT = np.trace(self.model.likelihood.YYT)
|
||||
|
||||
ci = self.shift_constraints(j)
|
||||
f, fp = f_fp(self.x_opt[j])
|
||||
|
||||
step[j] = self.momentum * step[j] + self.learning_rate[j] * fp
|
||||
self.x_opt[j] -= step[j]
|
||||
self.restore_constraints(ci)
|
||||
|
||||
self.model.grads[j] = fp
|
||||
# restore likelihood _offset and _scale, otherwise when we call set_data(y) on
|
||||
# the next feature, it will get normalized with the mean and std of this one.
|
||||
self.model.likelihood._offset = 0
|
||||
self.model.likelihood._scale = 1
|
||||
|
||||
return f, step, self.model.N
|
||||
|
||||
def adapt_learning_rate(self, t, D):
|
||||
if self.learning_rate_adaptation == 'adagrad':
|
||||
if t > 0:
|
||||
g_k = self.model.grads
|
||||
self.s_k += np.square(g_k)
|
||||
t0 = 100.0
|
||||
self.learning_rate = 0.1/(t0 + np.sqrt(self.s_k))
|
||||
|
||||
import pdb; pdb.set_trace()
|
||||
else:
|
||||
self.learning_rate = np.zeros_like(self.learning_rate)
|
||||
self.s_k = np.zeros_like(self.x_opt)
|
||||
|
||||
elif self.learning_rate_adaptation == 'annealing':
|
||||
#self.learning_rate = self.learning_rate_0/(1+float(t+1)/10)
|
||||
self.learning_rate = np.ones_like(self.learning_rate) * self.schedule[t]
|
||||
|
||||
|
||||
elif self.learning_rate_adaptation == 'semi_pesky':
|
||||
if self.model.__class__.__name__ == 'Bayesian_GPLVM':
|
||||
g_t = self.model.grads
|
||||
if t == 0:
|
||||
self.hbar_t = 0.0
|
||||
self.tau_t = 100.0
|
||||
self.gbar_t = 0.0
|
||||
|
||||
self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t
|
||||
self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t)
|
||||
self.learning_rate = np.ones_like(self.learning_rate)*(np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t)
|
||||
tau_t = self.tau_t*(1-self.learning_rate) + 1
|
||||
|
||||
|
||||
def opt(self, f_fp=None, f=None, fp=None):
|
||||
self.x_opt = self.model._get_params_transformed()
|
||||
self.grads = []
|
||||
|
||||
X, Y = self.model.X.copy(), self.model.likelihood.Y.copy()
|
||||
|
||||
self.model.likelihood.YYT = 0
|
||||
self.model.likelihood.trYYT = 0
|
||||
self.model.likelihood._offset = 0.0
|
||||
self.model.likelihood._scale = 1.0
|
||||
|
||||
N, input_dim = self.model.X.shape
|
||||
D = self.model.likelihood.Y.shape[1]
|
||||
num_params = self.model._get_params()
|
||||
self.trace = []
|
||||
missing_data = self.check_for_missing(self.model.likelihood.Y)
|
||||
|
||||
step = np.zeros_like(num_params)
|
||||
for it in range(self.iterations):
|
||||
if self.actual_iter != None:
|
||||
it = self.actual_iter
|
||||
|
||||
self.model.grads = np.zeros_like(self.x_opt) # TODO this is ugly
|
||||
|
||||
if it == 0 or self.self_paced is False:
|
||||
features = np.random.permutation(Y.shape[1])
|
||||
else:
|
||||
features = np.argsort(NLL)
|
||||
|
||||
b = len(features)/self.batch_size
|
||||
features = [features[i::b] for i in range(b)]
|
||||
NLL = []
|
||||
import pylab as plt
|
||||
for count, j in enumerate(features):
|
||||
self.model.D = len(j)
|
||||
self.model.likelihood.D = len(j)
|
||||
self.model.likelihood.set_data(Y[:, j])
|
||||
# self.model.likelihood.V = self.model.likelihood.Y*self.model.likelihood.precision
|
||||
|
||||
sigma = self.model.likelihood._variance
|
||||
self.model.likelihood._variance = None # invalidate cache
|
||||
self.model.likelihood._set_params(sigma)
|
||||
|
||||
if missing_data:
|
||||
shapes = self.get_param_shapes(N, input_dim)
|
||||
f, step, Nj = self.step_with_missing_data(f_fp, X, step, shapes)
|
||||
else:
|
||||
self.model.likelihood.YYT = np.dot(self.model.likelihood.Y, self.model.likelihood.Y.T)
|
||||
self.model.likelihood.trYYT = np.trace(self.model.likelihood.YYT)
|
||||
Nj = N
|
||||
f, fp = f_fp(self.x_opt)
|
||||
self.model.grads = fp.copy()
|
||||
step = self.momentum * step + self.learning_rate * fp
|
||||
self.x_opt -= step
|
||||
|
||||
if self.messages == 2:
|
||||
noise = self.model.likelihood._variance
|
||||
status = "evaluating {feature: 5d}/{tot: 5d} \t f: {f: 2.3f} \t non-missing: {nm: 4d}\t noise: {noise: 2.4f}\r".format(feature = count, tot = len(features), f = f, nm = Nj, noise = noise)
|
||||
sys.stdout.write(status)
|
||||
sys.stdout.flush()
|
||||
self.param_traces['noise'].append(noise)
|
||||
|
||||
self.adapt_learning_rate(it+count, D)
|
||||
NLL.append(f)
|
||||
self.fopt_trace.append(NLL[-1])
|
||||
# fig = plt.figure('traces')
|
||||
# plt.clf()
|
||||
# plt.plot(self.param_traces['noise'])
|
||||
|
||||
# for k in self.param_traces.keys():
|
||||
# self.param_traces[k].append(self.model.get(k)[0])
|
||||
self.grads.append(self.model.grads.tolist())
|
||||
# should really be a sum(), but earlier samples in the iteration will have a very crappy ll
|
||||
self.f_opt = np.mean(NLL)
|
||||
self.model.N = N
|
||||
self.model.X = X
|
||||
self.model.D = D
|
||||
self.model.likelihood.N = N
|
||||
self.model.likelihood.D = D
|
||||
self.model.likelihood.Y = Y
|
||||
sigma = self.model.likelihood._variance
|
||||
self.model.likelihood._variance = None # invalidate cache
|
||||
self.model.likelihood._set_params(sigma)
|
||||
|
||||
self.trace.append(self.f_opt)
|
||||
if self.iteration_file is not None:
|
||||
f = open(self.iteration_file + "iteration%d.pickle" % it, 'w')
|
||||
data = [self.x_opt, self.fopt_trace, self.param_traces]
|
||||
pickle.dump(data, f)
|
||||
f.close()
|
||||
|
||||
if self.messages != 0:
|
||||
sys.stdout.write('\r' + ' '*len(status)*2 + ' \r')
|
||||
status = "SGD Iteration: {0: 3d}/{1: 3d} f: {2: 2.3f} max eta: {3: 1.5f}\n".format(it+1, self.iterations, self.f_opt, self.learning_rate.max())
|
||||
sys.stdout.write(status)
|
||||
sys.stdout.flush()
|
||||
|
|
@ -1,18 +1,16 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import pdb
|
||||
import pylab as pb
|
||||
import datetime as dt
|
||||
from scipy import optimize
|
||||
import numpy as np
|
||||
|
||||
try:
|
||||
import rasmussens_minimize as rasm
|
||||
rasm_available = True
|
||||
except ImportError:
|
||||
rasm_available = False
|
||||
from SCG import SCG
|
||||
from scg import SCG
|
||||
|
||||
class Optimizer():
|
||||
"""
|
||||
|
|
@ -51,9 +49,9 @@ class Optimizer():
|
|||
start = dt.datetime.now()
|
||||
self.opt(**kwargs)
|
||||
end = dt.datetime.now()
|
||||
self.time = str(end-start)
|
||||
self.time = str(end - start)
|
||||
|
||||
def opt(self, f_fp = None, f = None, fp = None):
|
||||
def opt(self, f_fp=None, f=None, fp=None):
|
||||
raise NotImplementedError, "this needs to be implemented to use the optimizer class"
|
||||
|
||||
def plot(self):
|
||||
|
|
@ -78,7 +76,7 @@ class opt_tnc(Optimizer):
|
|||
Optimizer.__init__(self, *args, **kwargs)
|
||||
self.opt_name = "TNC (Scipy implementation)"
|
||||
|
||||
def opt(self, f_fp = None, f = None, fp = None):
|
||||
def opt(self, f_fp=None, f=None, fp=None):
|
||||
"""
|
||||
Run the TNC optimizer
|
||||
|
||||
|
|
@ -96,8 +94,8 @@ class opt_tnc(Optimizer):
|
|||
if self.gtol is not None:
|
||||
opt_dict['pgtol'] = self.gtol
|
||||
|
||||
opt_result = optimize.fmin_tnc(f_fp, self.x_init, messages = self.messages,
|
||||
maxfun = self.max_f_eval, **opt_dict)
|
||||
opt_result = optimize.fmin_tnc(f_fp, self.x_init, messages=self.messages,
|
||||
maxfun=self.max_f_eval, **opt_dict)
|
||||
self.x_opt = opt_result[0]
|
||||
self.f_opt = f_fp(self.x_opt)[0]
|
||||
self.funct_eval = opt_result[1]
|
||||
|
|
@ -108,7 +106,7 @@ class opt_lbfgsb(Optimizer):
|
|||
Optimizer.__init__(self, *args, **kwargs)
|
||||
self.opt_name = "L-BFGS-B (Scipy implementation)"
|
||||
|
||||
def opt(self, f_fp = None, f = None, fp = None):
|
||||
def opt(self, f_fp=None, f=None, fp=None):
|
||||
"""
|
||||
Run the optimizer
|
||||
|
||||
|
|
@ -130,8 +128,8 @@ class opt_lbfgsb(Optimizer):
|
|||
if self.gtol is not None:
|
||||
opt_dict['pgtol'] = self.gtol
|
||||
|
||||
opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint = iprint,
|
||||
maxfun = self.max_f_eval, **opt_dict)
|
||||
opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint=iprint,
|
||||
maxfun=self.max_f_eval, **opt_dict)
|
||||
self.x_opt = opt_result[0]
|
||||
self.f_opt = f_fp(self.x_opt)[0]
|
||||
self.funct_eval = opt_result[2]['funcalls']
|
||||
|
|
@ -142,12 +140,12 @@ class opt_simplex(Optimizer):
|
|||
Optimizer.__init__(self, *args, **kwargs)
|
||||
self.opt_name = "Nelder-Mead simplex routine (via Scipy)"
|
||||
|
||||
def opt(self, f_fp = None, f = None, fp = None):
|
||||
def opt(self, f_fp=None, f=None, fp=None):
|
||||
"""
|
||||
The simplex optimizer does not require gradients.
|
||||
"""
|
||||
|
||||
statuses = ['Converged', 'Maximum number of function evaluations made','Maximum number of iterations reached']
|
||||
statuses = ['Converged', 'Maximum number of function evaluations made', 'Maximum number of iterations reached']
|
||||
|
||||
opt_dict = {}
|
||||
if self.xtol is not None:
|
||||
|
|
@ -157,8 +155,8 @@ class opt_simplex(Optimizer):
|
|||
if self.gtol is not None:
|
||||
print "WARNING: simplex doesn't have an gtol arg, so I'm going to ignore it"
|
||||
|
||||
opt_result = optimize.fmin(f, self.x_init, (), disp = self.messages,
|
||||
maxfun = self.max_f_eval, full_output=True, **opt_dict)
|
||||
opt_result = optimize.fmin(f, self.x_init, (), disp=self.messages,
|
||||
maxfun=self.max_f_eval, full_output=True, **opt_dict)
|
||||
|
||||
self.x_opt = opt_result[0]
|
||||
self.f_opt = opt_result[1]
|
||||
|
|
@ -172,7 +170,7 @@ class opt_rasm(Optimizer):
|
|||
Optimizer.__init__(self, *args, **kwargs)
|
||||
self.opt_name = "Rasmussen's Conjugate Gradient"
|
||||
|
||||
def opt(self, f_fp = None, f = None, fp = None):
|
||||
def opt(self, f_fp=None, f=None, fp=None):
|
||||
"""
|
||||
Run Rasmussen's Conjugate Gradient optimizer
|
||||
"""
|
||||
|
|
@ -189,8 +187,8 @@ class opt_rasm(Optimizer):
|
|||
if self.gtol is not None:
|
||||
print "WARNING: minimize doesn't have an gtol arg, so I'm going to ignore it"
|
||||
|
||||
opt_result = rasm.minimize(self.x_init, f_fp, (), messages = self.messages,
|
||||
maxnumfuneval = self.max_f_eval)
|
||||
opt_result = rasm.minimize(self.x_init, f_fp, (), messages=self.messages,
|
||||
maxnumfuneval=self.max_f_eval)
|
||||
self.x_opt = opt_result[0]
|
||||
self.f_opt = opt_result[1][-1]
|
||||
self.funct_eval = opt_result[2]
|
||||
|
|
@ -203,7 +201,7 @@ class opt_SCG(Optimizer):
|
|||
Optimizer.__init__(self, *args, **kwargs)
|
||||
self.opt_name = "Scaled Conjugate Gradients"
|
||||
|
||||
def opt(self, f_fp = None, f = None, fp = None):
|
||||
def opt(self, f_fp=None, f=None, fp=None):
|
||||
assert not f is None
|
||||
assert not fp is None
|
||||
opt_result = SCG(f, fp, self.x_init, display=self.messages,
|
||||
|
|
@ -218,7 +216,7 @@ class opt_SCG(Optimizer):
|
|||
self.status = opt_result[3]
|
||||
|
||||
def get_optimizer(f_min):
|
||||
from SGD import opt_SGD
|
||||
from sgd import opt_SGD
|
||||
|
||||
optimizers = {'fmin_tnc': opt_tnc,
|
||||
'simplex': opt_simplex,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue