diff --git a/GPy/inference/optimization/conjugate_gradient_descent.py b/GPy/inference/optimization/conjugate_gradient_descent.py
index 8f90b536..dfc4a48d 100644
--- a/GPy/inference/optimization/conjugate_gradient_descent.py
+++ b/GPy/inference/optimization/conjugate_gradient_descent.py
@@ -1,8 +1,6 @@
-'''
-Created on 24 Apr 2013
+# Copyright (c) 2012-2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-@author: maxz
-'''
 from gradient_descent_update_rules import FletcherReeves, \
     PolakRibiere
 from Queue import Empty
diff --git a/GPy/inference/optimization/gradient_descent_update_rules.py b/GPy/inference/optimization/gradient_descent_update_rules.py
index 1c14ed63..9536549c 100644
--- a/GPy/inference/optimization/gradient_descent_update_rules.py
+++ b/GPy/inference/optimization/gradient_descent_update_rules.py
@@ -1,8 +1,6 @@
-'''
-Created on 24 Apr 2013
+# Copyright (c) 2012-2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-@author: maxz
-'''
 import numpy
 
 class GDUpdateRule():
diff --git a/GPy/inference/optimization/optimization.py b/GPy/inference/optimization/optimization.py
index 45586a1d..f7e1206f 100644
--- a/GPy/inference/optimization/optimization.py
+++ b/GPy/inference/optimization/optimization.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import datetime as dt
diff --git a/GPy/inference/optimization/scg.py b/GPy/inference/optimization/scg.py
index e183b7a8..7efeb781 100644
--- a/GPy/inference/optimization/scg.py
+++ b/GPy/inference/optimization/scg.py
@@ -1,4 +1,4 @@
-# Copyright I. Nabney, N.Lawrence and James Hensman (1996 - 2012)
+# Copyright I. Nabney, N.Lawrence and James Hensman (1996 - 2014)
 
 # Scaled Conjuagte Gradients, originally in Matlab as part of the Netlab toolbox by I. Nabney, converted to python N. Lawrence and given a pythonic interface by James Hensman
 
diff --git a/GPy/inference/optimization/sgd.py b/GPy/inference/optimization/sgd.py
deleted file mode 100644
index fd089bf5..00000000
--- a/GPy/inference/optimization/sgd.py
+++ /dev/null
@@ -1,346 +0,0 @@
-import numpy as np
-import scipy as sp
-import scipy.sparse
-from optimization import Optimizer
-from scipy import linalg, optimize
-import copy, sys, pickle
-
-class opt_SGD(Optimizer):
-    """
-    Optimize using stochastic gradient descent.
-
-    :param Model: reference to the Model object
-    :param iterations: number of iterations
-    :param learning_rate: learning rate
-    :param momentum: momentum
-
-    """
-
-    def __init__(self, start, iterations = 10, learning_rate = 1e-4, momentum = 0.9, model = None, messages = False, batch_size = 1, self_paced = False, center = True, iteration_file = None, learning_rate_adaptation=None, actual_iter=None, schedule=None, **kwargs):
-        self.opt_name = "Stochastic Gradient Descent"
-
-        self.Model = model
-        self.iterations = iterations
-        self.momentum = momentum
-        self.learning_rate = learning_rate
-        self.x_opt = None
-        self.f_opt = None
-        self.messages = messages
-        self.batch_size = batch_size
-        self.self_paced = self_paced
-        self.center = center
-        self.param_traces = [('noise',[])]
-        self.iteration_file = iteration_file
-        self.learning_rate_adaptation = learning_rate_adaptation
-        self.actual_iter = actual_iter
-        if self.learning_rate_adaptation != None:
-            if self.learning_rate_adaptation == 'annealing':
-                self.learning_rate_0 = self.learning_rate
-            else:
-                self.learning_rate_0 = self.learning_rate.mean()
-
-        self.schedule = schedule
-        # if len([p for p in self.model.kern.parts if p.name == 'bias']) == 1:
-        #     self.param_traces.append(('bias',[]))
-        # if len([p for p in self.model.kern.parts if p.name == 'linear']) == 1:
-        #     self.param_traces.append(('linear',[]))
-        # if len([p for p in self.model.kern.parts if p.name == 'rbf']) == 1:
-        #     self.param_traces.append(('rbf_var',[]))
-
-        self.param_traces = dict(self.param_traces)
-        self.fopt_trace = []
-
-        num_params = len(self.Model._get_params())
-        if isinstance(self.learning_rate, float):
-            self.learning_rate = np.ones((num_params,)) * self.learning_rate
-
-        assert (len(self.learning_rate) == num_params), "there must be one learning rate per parameter"
-
-    def __str__(self):
-        status = "\nOptimizer: \t\t\t %s\n" % self.opt_name
-        status += "f(x_opt): \t\t\t %.4f\n" % self.f_opt
-        status += "Number of iterations: \t\t %d\n" % self.iterations
-        status += "Learning rate: \t\t\t max %.3f, min %.3f\n" % (self.learning_rate.max(), self.learning_rate.min())
-        status += "Momentum: \t\t\t %.3f\n" % self.momentum
-        status += "Batch size: \t\t\t %d\n" % self.batch_size
-        status += "Time elapsed: \t\t\t %s\n" % self.time
-        return status
-
-    def plot_traces(self):
-        """
-        See GPy.plotting.matplot_dep.inference_plots
-        """
-        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
-        from ..plotting.matplot_dep import inference_plots
-        inference_plots.plot_sgd_traces(self)
-
-    def non_null_samples(self, data):
-        return (np.isnan(data).sum(axis=1) == 0)
-
-    def check_for_missing(self, data):
-        if sp.sparse.issparse(self.Model.likelihood.Y):
-            return True
-        else:
-            return np.isnan(data).sum() > 0
-
-    def subset_parameter_vector(self, x, samples, param_shapes):
-        subset = np.array([], dtype = int)
-        x = np.arange(0, len(x))
-        i = 0
-
-        for s in param_shapes:
-            N, input_dim = s
-            X = x[i:i+N*input_dim].reshape(N, input_dim)
-            X = X[samples]
-            subset = np.append(subset, X.flatten())
-            i += N*input_dim
-
-        subset = np.append(subset, x[i:])
-
-        return subset
-
-    def shift_constraints(self, j):
-
-        constrained_indices = copy.deepcopy(self.Model.constrained_indices)
-
-        for c, constraint in enumerate(constrained_indices):
-            mask = (np.ones_like(constrained_indices[c]) == 1)
-            for i in range(len(constrained_indices[c])):
-                pos = np.where(j == constrained_indices[c][i])[0]
-                if len(pos) == 1:
-                    self.Model.constrained_indices[c][i] = pos
-                else:
-                    mask[i] = False
-
-            self.Model.constrained_indices[c] = self.Model.constrained_indices[c][mask]
-        return constrained_indices
-        # back them up
-        # bounded_i = copy.deepcopy(self.Model.constrained_bounded_indices)
-        # bounded_l = copy.deepcopy(self.Model.constrained_bounded_lowers)
-        # bounded_u = copy.deepcopy(self.Model.constrained_bounded_uppers)
-
-        # for b in range(len(bounded_i)): # for each group of constraints
-        #     for bc in range(len(bounded_i[b])):
-        #         pos = np.where(j == bounded_i[b][bc])[0]
-        #         if len(pos) == 1:
-        #             pos2 = np.where(self.Model.constrained_bounded_indices[b] == bounded_i[b][bc])[0][0]
-        #             self.Model.constrained_bounded_indices[b][pos2] = pos[0]
-        #         else:
-        #             if len(self.Model.constrained_bounded_indices[b]) == 1:
-        #                 # if it's the last index to be removed
-        #                 # the logic here is just a mess. If we remove the last one, then all the
-        #                 # b-indices change and we have to iterate through everything to find our
-        #                 # current index. Can't deal with this right now.
-        #                 raise NotImplementedError
-
-        #             else: # just remove it from the indices
-        #                 mask = self.Model.constrained_bounded_indices[b] != bc
-        #                 self.Model.constrained_bounded_indices[b] = self.Model.constrained_bounded_indices[b][mask]
-
-
-        # # here we shif the positive constraints. We cycle through each positive
-        # # constraint
-        # positive = self.Model.constrained_positive_indices.copy()
-        # mask = (np.ones_like(positive) == 1)
-        # for p in range(len(positive)):
-        #     # we now check whether the constrained index appears in the j vector
-        #     # (the vector of the "active" indices)
-        #     pos = np.where(j == self.Model.constrained_positive_indices[p])[0]
-        #     if len(pos) == 1:
-        #         self.Model.constrained_positive_indices[p] = pos
-        #     else:
-        #         mask[p] = False
-        # self.Model.constrained_positive_indices = self.Model.constrained_positive_indices[mask]
-
-        # return (bounded_i, bounded_l, bounded_u), positive
-
-    def restore_constraints(self, c):#b, p):
-        # self.Model.constrained_bounded_indices = b[0]
-        # self.Model.constrained_bounded_lowers = b[1]
-        # self.Model.constrained_bounded_uppers = b[2]
-        # self.Model.constrained_positive_indices = p
-        self.Model.constrained_indices = c
-
-    def get_param_shapes(self, N = None, input_dim = None):
-        model_name = self.Model.__class__.__name__
-        if model_name == 'GPLVM':
-            return [(N, input_dim)]
-        if model_name == 'Bayesian_GPLVM':
-            return [(N, input_dim), (N, input_dim)]
-        else:
-            raise NotImplementedError
-
-    def step_with_missing_data(self, f_fp, X, step, shapes):
-        N, input_dim = X.shape
-
-        if not sp.sparse.issparse(self.Model.likelihood.Y):
-            Y = self.Model.likelihood.Y
-            samples = self.non_null_samples(self.Model.likelihood.Y)
-            self.Model.N = samples.sum()
-            Y = Y[samples]
-        else:
-            samples = self.Model.likelihood.Y.nonzero()[0]
-            self.Model.N = len(samples)
-            Y = np.asarray(self.Model.likelihood.Y[samples].todense(), dtype = np.float64)
-
-        if self.Model.N == 0 or Y.std() == 0.0:
-            return 0, step, self.Model.N
-
-        self.Model.likelihood._offset = Y.mean()
-        self.Model.likelihood._scale = Y.std()
-        self.Model.likelihood.set_data(Y)
-        # self.Model.likelihood.V = self.Model.likelihood.Y*self.Model.likelihood.precision
-
-        sigma = self.Model.likelihood._variance
-        self.Model.likelihood._variance = None # invalidate cache
-        self.Model.likelihood._set_params(sigma)
-
-
-        j = self.subset_parameter_vector(self.x_opt, samples, shapes)
-        self.Model.X = X[samples]
-
-        model_name = self.Model.__class__.__name__
-
-        if model_name == 'Bayesian_GPLVM':
-            self.Model.likelihood.YYT = np.dot(self.Model.likelihood.Y, self.Model.likelihood.Y.T)
-            self.Model.likelihood.trYYT = np.trace(self.Model.likelihood.YYT)
-
-        ci = self.shift_constraints(j)
-        f, fp = f_fp(self.x_opt[j])
-
-        step[j] = self.momentum * step[j] + self.learning_rate[j] * fp
-        self.x_opt[j] -= step[j]
-        self.restore_constraints(ci)
-
-        self.Model.grads[j] = fp
-        # restore likelihood _offset and _scale, otherwise when we call set_data(y) on
-        # the next feature, it will get normalized with the mean and std of this one.
-        self.Model.likelihood._offset = 0
-        self.Model.likelihood._scale = 1
-
-        return f, step, self.Model.N
-
-    def adapt_learning_rate(self, t, D):
-        if self.learning_rate_adaptation == 'adagrad':
-            if t > 0:
-                g_k = self.Model.grads
-                self.s_k += np.square(g_k)
-                t0 = 100.0
-                self.learning_rate = 0.1/(t0 + np.sqrt(self.s_k))
-
-                import pdb; pdb.set_trace()
-            else:
-                self.learning_rate = np.zeros_like(self.learning_rate)
-                self.s_k = np.zeros_like(self.x_opt)
-
-        elif self.learning_rate_adaptation == 'annealing':
-            #self.learning_rate = self.learning_rate_0/(1+float(t+1)/10)
-            self.learning_rate = np.ones_like(self.learning_rate) * self.schedule[t]
-
-
-        elif self.learning_rate_adaptation == 'semi_pesky':
-            if self.Model.__class__.__name__ == 'Bayesian_GPLVM':
-                g_t = self.Model.grads
-                if t == 0:
-                    self.hbar_t = 0.0
-                    self.tau_t = 100.0
-                    self.gbar_t = 0.0
-
-                self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t
-                self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t)
-                self.learning_rate = np.ones_like(self.learning_rate)*(np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t)
-                tau_t = self.tau_t*(1-self.learning_rate) + 1
-
-
-    def opt(self, f_fp=None, f=None, fp=None):
-        self.x_opt = self.Model._get_params_transformed()
-        self.grads = []
-
-        X, Y = self.Model.X.copy(), self.Model.likelihood.Y.copy()
-
-        self.Model.likelihood.YYT = 0
-        self.Model.likelihood.trYYT = 0
-        self.Model.likelihood._offset = 0.0
-        self.Model.likelihood._scale = 1.0
-
-        N, input_dim = self.Model.X.shape
-        D = self.Model.likelihood.Y.shape[1]
-        num_params = self.Model._get_params()
-        self.trace = []
-        missing_data = self.check_for_missing(self.Model.likelihood.Y)
-
-        step = np.zeros_like(num_params)
-        for it in range(self.iterations):
-            if self.actual_iter != None:
-                it = self.actual_iter
-
-            self.Model.grads = np.zeros_like(self.x_opt) # TODO this is ugly
-
-            if it == 0 or self.self_paced is False:
-                features = np.random.permutation(Y.shape[1])
-            else:
-                features = np.argsort(NLL)
-
-            b = len(features)/self.batch_size
-            features = [features[i::b] for i in range(b)]
-            NLL = []
-            for count, j in enumerate(features):
-                self.Model.input_dim = len(j)
-                self.Model.likelihood.input_dim = len(j)
-                self.Model.likelihood.set_data(Y[:, j])
-                # self.Model.likelihood.V = self.Model.likelihood.Y*self.Model.likelihood.precision
-
-                sigma = self.Model.likelihood._variance
-                self.Model.likelihood._variance = None # invalidate cache
-                self.Model.likelihood._set_params(sigma)
-
-                if missing_data:
-                    shapes = self.get_param_shapes(N, input_dim)
-                    f, step, Nj = self.step_with_missing_data(f_fp, X, step, shapes)
-                else:
-                    self.Model.likelihood.YYT = np.dot(self.Model.likelihood.Y, self.Model.likelihood.Y.T)
-                    self.Model.likelihood.trYYT = np.trace(self.Model.likelihood.YYT)
-                    Nj = N
-                    f, fp = f_fp(self.x_opt)
-                    self.Model.grads = fp.copy()
-                    step = self.momentum * step + self.learning_rate * fp
-                    self.x_opt -= step
-
-                if self.messages == 2:
-                    noise = self.Model.likelihood._variance
-                    status = "evaluating {feature: 5d}/{tot: 5d} \t f: {f: 2.3f} \t non-missing: {nm: 4d}\t noise: {noise: 2.4f}\r".format(feature = count, tot = len(features), f = f, nm = Nj, noise = noise)
-                    sys.stdout.write(status)
-                    sys.stdout.flush()
-                    self.param_traces['noise'].append(noise)
-
-                self.adapt_learning_rate(it+count, D)
-                NLL.append(f)
-                self.fopt_trace.append(NLL[-1])
-
-                # for k in self.param_traces.keys():
-                #     self.param_traces[k].append(self.Model.get(k)[0])
-            self.grads.append(self.Model.grads.tolist())
-            # should really be a sum(), but earlier samples in the iteration will have a very crappy ll
-            self.f_opt = np.mean(NLL)
-            self.Model.N = N
-            self.Model.X = X
-            self.Model.input_dim = D
-            self.Model.likelihood.N = N
-            self.Model.likelihood.input_dim = D
-            self.Model.likelihood.Y = Y
-            sigma = self.Model.likelihood._variance
-            self.Model.likelihood._variance = None # invalidate cache
-            self.Model.likelihood._set_params(sigma)
-
-            self.trace.append(self.f_opt)
-            if self.iteration_file is not None:
-                f = open(self.iteration_file + "iteration%d.pickle" % it, 'w')
-                data = [self.x_opt, self.fopt_trace, self.param_traces]
-                pickle.dump(data, f)
-                f.close()
-
-            if self.messages != 0:
-                sys.stdout.write('\r' + ' '*len(status)*2 + ' \r')
-                status = "SGD Iteration: {0: 3d}/{1: 3d} f: {2: 2.3f} max eta: {3: 1.5f}\n".format(it+1, self.iterations, self.f_opt, self.learning_rate.max())
-                sys.stdout.write(status)
-                sys.stdout.flush()
diff --git a/GPy/inference/optimization/stochastics.py b/GPy/inference/optimization/stochastics.py
index f19c3c2e..dc71d539 100644
--- a/GPy/inference/optimization/stochastics.py
+++ b/GPy/inference/optimization/stochastics.py
@@ -1,8 +1,5 @@
-'''
-Created on 9 Oct 2014
-
-@author: maxz
-'''
+# Copyright (c) 2012-2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 class StochasticStorage(object):
     '''
@@ -56,4 +53,4 @@ class SparseGPStochastics(StochasticStorage):
 
     def reset(self):
         self.current_dim = -1
-        self.d = None
\ No newline at end of file
+        self.d = None
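
Note on the removed optimizer: at its core, the deleted opt_SGD class applied a classic momentum update to the transformed parameter vector, step = momentum * step + learning_rate * grad, followed by x -= step. For readers who relied on sgd.py, here is a minimal self-contained sketch of that update rule in plain NumPy. It is illustrative only, not part of the GPy API; sgd_momentum and the toy quadratic objective are hypothetical names introduced for this example.

    import numpy as np

    def sgd_momentum(f_fp, x0, learning_rate=1e-4, momentum=0.9, iterations=100):
        # f_fp(x) returns (objective value, gradient), the same contract
        # that opt_SGD.opt() expected from its f_fp argument.
        x = np.asarray(x0, dtype=float).copy()
        step = np.zeros_like(x)
        for _ in range(iterations):
            _, grad = f_fp(x)
            # Accumulate a velocity term, then descend along it.
            step = momentum * step + learning_rate * grad
            x -= step
        return x, f_fp(x)[0]

    # Toy usage on a quadratic with its minimum at (1, -2).
    target = np.array([1.0, -2.0])
    f_fp = lambda x: (np.sum((x - target) ** 2), 2.0 * (x - target))
    x_opt, f_opt = sgd_momentum(f_fp, np.zeros(2), learning_rate=0.05, iterations=200)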