GPy/GPy/inference/SGD.py
2013-04-16 15:03:18 +01:00

277 lines
11 KiB
Python

import numpy as np
import scipy as sp
import scipy.sparse
from optimization import Optimizer
from scipy import linalg, optimize
import pylab as plt
import copy, sys, pickle
class opt_SGD(Optimizer):
"""
Optimize using stochastic gradient descent.
*** Parameters ***
model: reference to the model object
iterations: number of iterations
learning_rate: learning rate
momentum: momentum
"""
def __init__(self, start, iterations = 10, learning_rate = 1e-4, momentum = 0.9, model = None, messages = False, batch_size = 1, self_paced = False, center = True, iteration_file = None, **kwargs):
self.opt_name = "Stochastic Gradient Descent"
self.model = model
self.iterations = iterations
self.momentum = momentum
self.learning_rate = learning_rate
self.x_opt = None
self.f_opt = None
self.messages = messages
self.batch_size = batch_size
self.self_paced = self_paced
self.center = center
self.param_traces = [('noise',[])]
self.iteration_file = iteration_file
# if len([p for p in self.model.kern.parts if p.name == 'bias']) == 1:
# self.param_traces.append(('bias',[]))
# if len([p for p in self.model.kern.parts if p.name == 'linear']) == 1:
# self.param_traces.append(('linear',[]))
# if len([p for p in self.model.kern.parts if p.name == 'rbf']) == 1:
# self.param_traces.append(('rbf_var',[]))
self.param_traces = dict(self.param_traces)
self.fopt_trace = []
num_params = len(self.model._get_params())
if isinstance(self.learning_rate, float):
self.learning_rate = np.ones((num_params,)) * self.learning_rate
assert (len(self.learning_rate) == num_params), "there must be one learning rate per parameter"
def __str__(self):
status = "\nOptimizer: \t\t\t %s\n" % self.opt_name
status += "f(x_opt): \t\t\t %.4f\n" % self.f_opt
status += "Number of iterations: \t\t %d\n" % self.iterations
status += "Learning rate: \t\t\t max %.3f, min %.3f\n" % (self.learning_rate.max(), self.learning_rate.min())
status += "Momentum: \t\t\t %.3f\n" % self.momentum
status += "Batch size: \t\t\t %d\n" % self.batch_size
status += "Time elapsed: \t\t\t %s\n" % self.time
return status
def plot_traces(self):
plt.figure()
plt.subplot(211)
plt.title('Parameters')
for k in self.param_traces.keys():
plt.plot(self.param_traces[k], label=k)
plt.legend(loc=0)
plt.subplot(212)
plt.title('Objective function')
plt.plot(self.fopt_trace)
def non_null_samples(self, data):
return (np.isnan(data).sum(axis=1) == 0)
def check_for_missing(self, data):
if sp.sparse.issparse(self.model.likelihood.Y):
return True
else:
return np.isnan(data).sum() > 0
def subset_parameter_vector(self, x, samples, param_shapes):
subset = np.array([], dtype = int)
x = np.arange(0, len(x))
i = 0
for s in param_shapes:
N, Q = s
X = x[i:i+N*Q].reshape(N, Q)
X = X[samples]
subset = np.append(subset, X.flatten())
i += N*Q
subset = np.append(subset, x[i:])
return subset
def shift_constraints(self, j):
# back them up
bounded_i = copy.deepcopy(self.model.constrained_bounded_indices)
bounded_l = copy.deepcopy(self.model.constrained_bounded_lowers)
bounded_u = copy.deepcopy(self.model.constrained_bounded_uppers)
for b in range(len(bounded_i)): # for each group of constraints
for bc in range(len(bounded_i[b])):
pos = np.where(j == bounded_i[b][bc])[0]
if len(pos) == 1:
pos2 = np.where(self.model.constrained_bounded_indices[b] == bounded_i[b][bc])[0][0]
self.model.constrained_bounded_indices[b][pos2] = pos[0]
else:
if len(self.model.constrained_bounded_indices[b]) == 1:
# if it's the last index to be removed
# the logic here is just a mess. If we remove the last one, then all the
# b-indices change and we have to iterate through everything to find our
# current index. Can't deal with this right now.
raise NotImplementedError
else: # just remove it from the indices
mask = self.model.constrained_bounded_indices[b] != bc
self.model.constrained_bounded_indices[b] = self.model.constrained_bounded_indices[b][mask]
# here we shif the positive constraints. We cycle through each positive
# constraint
positive = self.model.constrained_positive_indices.copy()
mask = (np.ones_like(positive) == 1)
for p in range(len(positive)):
# we now check whether the constrained index appears in the j vector
# (the vector of the "active" indices)
pos = np.where(j == self.model.constrained_positive_indices[p])[0]
if len(pos) == 1:
self.model.constrained_positive_indices[p] = pos
else:
mask[p] = False
self.model.constrained_positive_indices = self.model.constrained_positive_indices[mask]
return (bounded_i, bounded_l, bounded_u), positive
def restore_constraints(self, b, p):
self.model.constrained_bounded_indices = b[0]
self.model.constrained_bounded_lowers = b[1]
self.model.constrained_bounded_uppers = b[2]
self.model.constrained_positive_indices = p
def get_param_shapes(self, N = None, Q = None):
model_name = self.model.__class__.__name__
if model_name == 'GPLVM':
return [(N, Q)]
if model_name == 'Bayesian_GPLVM':
return [(N, Q), (N, Q)]
else:
raise NotImplementedError
def step_with_missing_data(self, f_fp, X, step, shapes):
N, Q = X.shape
if not sp.sparse.issparse(self.model.likelihood.Y):
Y = self.model.likelihood.Y
samples = self.non_null_samples(self.model.likelihood.Y)
self.model.N = samples.sum()
Y = Y[samples]
else:
samples = self.model.likelihood.Y.nonzero()[0]
self.model.N = len(samples)
Y = np.asarray(self.model.likelihood.Y[samples].todense(), dtype = np.float64)
if self.model.N == 0 or Y.std() == 0.0:
return 0, step, self.model.N
self.model.likelihood._mean = Y.mean()
self.model.likelihood._std = Y.std()
self.model.likelihood.set_data(Y)
j = self.subset_parameter_vector(self.x_opt, samples, shapes)
self.model.X = X[samples]
model_name = self.model.__class__.__name__
if model_name == 'Bayesian_GPLVM':
self.model.likelihood.YYT = np.dot(self.model.likelihood.Y, self.model.likelihood.Y.T)
self.model.likelihood.trYYT = np.trace(self.model.likelihood.YYT)
b, p = self.shift_constraints(j)
f, fp = f_fp(self.x_opt[j])
step[j] = self.momentum * step[j] + self.learning_rate[j] * fp
self.x_opt[j] -= step[j]
self.restore_constraints(b, p)
# restore likelihood _mean and _std, otherwise when we call set_data(y) on
# the next feature, it will get normalized with the mean and std of this one.
self.model.likelihood._mean = 0
self.model.likelihood._std = 1
return f, step, self.model.N
def opt(self, f_fp=None, f=None, fp=None):
self.x_opt = self.model._get_params_transformed()
X, Y = self.model.X.copy(), self.model.likelihood.Y.copy()
self.model.likelihood.YYT = None
self.model.likelihood.trYYT = None
self.model.likelihood._mean = 0.0
self.model.likelihood._std = 1.0
N, Q = self.model.X.shape
D = self.model.likelihood.Y.shape[1]
num_params = self.model._get_params()
self.trace = []
missing_data = self.check_for_missing(self.model.likelihood.Y)
step = np.zeros_like(num_params)
for it in range(self.iterations):
if it == 0 or self.self_paced is False:
features = np.random.permutation(Y.shape[1])
else:
features = np.argsort(NLL)
b = len(features)/self.batch_size
features = [features[i::b] for i in range(b)]
NLL = []
import pylab as plt
for count, j in enumerate(features):
self.model.D = len(j)
self.model.likelihood.D = len(j)
self.model.likelihood.set_data(Y[:, j])
if missing_data:
shapes = self.get_param_shapes(N, Q)
f, step, Nj = self.step_with_missing_data(f_fp, X, step, shapes)
else:
self.model.likelihood.YYT = np.dot(self.model.likelihood.Y, self.model.likelihood.Y.T)
self.model.likelihood.trYYT = np.trace(self.model.likelihood.YYT)
Nj = N
f, fp = f_fp(self.x_opt)
step = self.momentum * step + self.learning_rate * fp
self.x_opt -= step
if self.messages == 2:
noise = self.model.likelihood._variance
status = "evaluating {feature: 5d}/{tot: 5d} \t f: {f: 2.3f} \t non-missing: {nm: 4d}\t noise: {noise: 2.4f}\r".format(feature = count, tot = len(features), f = f, nm = Nj, noise = noise)
sys.stdout.write(status)
sys.stdout.flush()
self.param_traces['noise'].append(noise)
NLL.append(f)
self.fopt_trace.append(f)
# fig = plt.figure('traces')
# plt.clf()
# plt.plot(self.param_traces['noise'])
# import pdb; pdb.set_trace()
# for k in self.param_traces.keys():
# self.param_traces[k].append(self.model.get(k)[0])
# should really be a sum(), but earlier samples in the iteration will have a very crappy ll
self.f_opt = np.mean(NLL)
self.model.N = N
self.model.X = X
self.model.D = D
self.model.likelihood.N = N
self.model.likelihood.D = D
self.model.likelihood.Y = Y
self.trace.append(self.f_opt)
if self.iteration_file is not None:
f = open(self.iteration_file + "iteration%d.pickle" % it, 'w')
data = [self.x_opt, self.fopt_trace, self.param_traces]
pickle.dump(data, f)
f.close()
if self.messages != 0:
sys.stdout.write('\r' + ' '*len(status)*2 + ' \r')
status = "SGD Iteration: {0: 3d}/{1: 3d} f: {2: 2.3f}\n".format(it+1, self.iterations, self.f_opt)
sys.stdout.write(status)
sys.stdout.flush()