From 5e7ca787a5156e92ce13abe042c6199ec758b6b9 Mon Sep 17 00:00:00 2001
From: Nicolo Fusi
Date: Mon, 10 Dec 2012 17:15:08 +0000
Subject: [PATCH] working implementation of SGD

---
 GPy/inference/SGD.py          | 195 ++++++++++++++++++++++++++++++++++
 GPy/inference/optimization.py |  16 +--
 2 files changed, 204 insertions(+), 7 deletions(-)
 create mode 100644 GPy/inference/SGD.py

diff --git a/GPy/inference/SGD.py b/GPy/inference/SGD.py
new file mode 100644
index 00000000..87fa4601
--- /dev/null
+++ b/GPy/inference/SGD.py
@@ -0,0 +1,195 @@
+import numpy as np
+from optimization import Optimizer
+from scipy import linalg, optimize
+import copy
+import sys
+
+class opt_SGD(Optimizer):
+    """
+    Optimize using stochastic gradient descent.
+
+    *** Parameters ***
+    model: reference to the model object
+    iterations: number of SGD iterations (passes over the output dimensions)
+    learning_rate: learning rate (a scalar, or one value per parameter)
+    momentum: momentum coefficient applied to the previous step
+
+    """
+
+    def __init__(self, start, f_fp, f, fp, iterations = 10, learning_rate = 1e-4, momentum = 0.9, model = None, messages = False, batch_size = 1, self_paced = False, **kwargs):
+        self.opt_name = "Stochastic Gradient Descent"
+
+        self.f = f
+        self.fp = fp
+        self.f_fp = f_fp
+        self.model = model
+        self.iterations = iterations
+        self.momentum = momentum
+        self.learning_rate = learning_rate
+        self.x_opt = None
+        self.f_opt = None
+        self.messages = messages
+        self.batch_size = batch_size
+        self.self_paced = self_paced
+
+        num_params = len(self.model.get_param())
+        if isinstance(self.learning_rate, float):
+            self.learning_rate = np.ones((num_params,)) * self.learning_rate
+
+        assert (len(self.learning_rate) == num_params), "there must be one learning rate per parameter"
+
+    def __str__(self):
+        status = "\nOptimizer: \t\t\t %s\n" % self.opt_name
+        status += "f(x_opt): \t\t\t %.4f\n" % self.f_opt
+        status += "Number of iterations: \t\t %d\n" % self.iterations
+        status += "Learning rate: \t\t\t max %.3f, min %.3f\n" % (self.learning_rate.max(), self.learning_rate.min())
+        status += "Momentum: \t\t\t %.3f\n" % self.momentum
+        status += "Batch size: \t\t\t %d\n" % self.batch_size
+        status += "Time elapsed: \t\t\t %s\n" % self.time
+        return status
+
+    def non_null_samples(self, data):
+        return np.isnan(data).sum(axis=1) == 0
+
+    def check_for_missing(self, data):
+        return np.isnan(data).sum() > 0
+
+    def subset_parameter_vector(self, x, samples, param_shapes):
+        subset = np.array([], dtype = int)
+        x = np.arange(0, len(x))
+        i = 0
+
+        for s in param_shapes:
+            N, Q = s
+            X = x[i:i + N*Q].reshape(N, Q)
+            X = X[samples]
+            subset = np.append(subset, X.flatten())
+            i += N*Q
+
+        subset = np.append(subset, x[i:])
+
+        return subset
+
+    def shift_constraints(self, j):
+        # back up the current constraint indices and bounds
+        bounded_i = copy.deepcopy(self.model.constrained_bounded_indices)
+        bounded_l = copy.deepcopy(self.model.constrained_bounded_lowers)
+        bounded_u = copy.deepcopy(self.model.constrained_bounded_uppers)
+
+        for b in range(len(bounded_i)): # for each group of constraints
+            for bc in range(len(bounded_i[b])):
+                pos = np.where(j == bounded_i[b][bc])[0]
+                if len(pos) == 1:
+                    pos2 = np.where(self.model.constrained_bounded_indices[b] == bounded_i[b][bc])[0][0]
+                    self.model.constrained_bounded_indices[b][pos2] = pos[0]
+                else:
+                    if len(self.model.constrained_bounded_indices[b]) == 1:
+                        # if it's the last index in this group to be removed:
+                        # removing it shifts all the b-indices, so we would have
+                        # to iterate through everything to find the current
+                        # index again. Not handled yet.
+                        raise NotImplementedError
+
+                    else: # just remove it from the indices
+                        mask = self.model.constrained_bounded_indices[b] != bc
+                        self.model.constrained_bounded_indices[b] = self.model.constrained_bounded_indices[b][mask]
+
+
+        positive = self.model.constrained_positive_indices.copy()
+        for p in range(len(positive)):
+            pos = np.where(j == self.model.constrained_positive_indices[p])[0][0]
+            self.model.constrained_positive_indices[p] = pos
+
+        return (bounded_i, bounded_l, bounded_u), positive
+
+    def restore_constraints(self, b, p):
+        self.model.constrained_bounded_indices = b[0]
+        self.model.constrained_bounded_lowers = b[1]
+        self.model.constrained_bounded_uppers = b[2]
+        self.model.constrained_positive_indices = p
+
+    def get_param_shapes(self, N = None, Q = None):
+        model_name = self.model.__class__.__name__
+        if model_name == 'GPLVM':
+            return [(N, Q)]
+        else:
+            raise NotImplementedError
+
+    def step_with_missing_data(self, X, Y, step, shapes):
+        N, Q = X.shape
+        samples = self.non_null_samples(self.model.Y)
+        j = self.subset_parameter_vector(self.x_opt, samples, shapes)
+        self.model.N = samples.sum()
+        self.model.X = X[samples]
+        self.model.Y = self.model.Y[samples]
+
+        if self.model.N == 0:
+            return 0, step, self.model.N
+
+        b, p = self.shift_constraints(j)
+
+        momentum_term = self.momentum * step[j]
+        f, fp = self.f_fp(self.x_opt[j])
+        step[j] = self.learning_rate[j] * fp
+        self.x_opt[j] -= step[j] + momentum_term
+
+        self.restore_constraints(b, p)
+
+        return f, step, self.model.N
+
+    def opt(self):
+        self.x_opt = self.model.get_param()
+        X, Y = self.model.X.copy(), self.model.Y.copy()
+        N, Q = self.model.X.shape
+        D = self.model.Y.shape[1]
+        self.trace = []
+        missing_data = self.check_for_missing(self.model.Y)
+        self.model.Youter = None # this is probably not very efficient
+
+        for it in range(self.iterations):
+            if it == 0 or self.self_paced is False:
+                features = np.random.permutation(Y.shape[1])
+            else:
+                features = np.argsort(LL)[::-1]
+
+            b = len(features)/self.batch_size
+            features = [features[i::b] for i in range(b)]
+            step = np.zeros_like(self.model.get_param())
+            LL = []
+            count = 0
+
+            for j in features:
+                count += 1
+                self.model.D = len(j)
+                self.model.Y = Y[:, j]
+
+                if missing_data:
+                    shapes = self.get_param_shapes(N, Q)
+                    f, step, Nj = self.step_with_missing_data(X, Y, step, shapes)
+                else:
+                    Nj = N
+                    momentum_term = self.momentum * step # compute momentum using update(t-1)
+                    f, fp = self.f_fp(self.x_opt)
+                    step = self.learning_rate * fp # compute update(t)
+                    self.x_opt -= step + momentum_term
+
+                if self.messages == 2:
+                    status = "evaluating {feature: 5d}/{tot: 5d} \t f: {f: 2.3f} \t non-missing: {nm: 4d}\r".format(feature = count, tot = len(features), f = -f, nm = Nj)
+                    sys.stdout.write(status)
+                    sys.stdout.flush()
+
+                LL.append(-f)
+
+            # should really be a sum(), but earlier samples in the iteration will have a very poor ll
+            self.f_opt = np.mean(LL)
+            self.model.N = N
+            self.model.Y = Y
+            self.model.X = X
+            self.model.D = D
+            # self.model.Youter = np.dot(Y, Y.T)
+            self.trace.append(self.f_opt)
+
+            if self.messages != 0:
+                sys.stdout.write('\r' + ' ' * 80 + '\r') # clear any progress line from the inner loop
+                status = "SGD Iteration: {0: 3d}/{1: 3d}  f: {2: 2.3f}\n".format(it, self.iterations, self.f_opt)
+                sys.stdout.write(status)
+                sys.stdout.flush()
diff --git a/GPy/inference/optimization.py b/GPy/inference/optimization.py
index c96d2a3b..19d627bc 100644
--- a/GPy/inference/optimization.py
+++ b/GPy/inference/optimization.py
@@ -1,9 +1,7 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-
 from scipy import optimize
-# import rasmussens_minimize as rasm
 import pdb
 import pylab as pb
 import datetime as dt
@@ -19,11 +17,11 @@ class Optimizer():
     :param messages: print messages from the optimizer?
     :type messages: (True | False)
     :param max_f_eval: maximum number of function evaluations
-
+
     :rtype: optimizer object.
-
-    """
-    def __init__(self, x_init, f_fp, f, fp , messages=False, max_f_eval=1e4, ftol=None, gtol=None, xtol=None):
+
+    """
+    def __init__(self, x_init, f_fp, f, fp , messages=False, max_f_eval=1e4, ftol=None, gtol=None, xtol=None, **kwargs):
         self.opt_name = None
         self.f_fp = f_fp
         self.f = f
@@ -193,10 +191,14 @@ class opt_simplex(Optimizer):
 #        self.trace = opt_result[1]
 
 def get_optimizer(f_min):
+    # import rasmussens_minimize as rasm
+    from SGD import opt_SGD
+
     optimizers = {'fmin_tnc': opt_tnc,
 #                  'rasmussen': opt_rasm,
                   'simplex': opt_simplex,
-                  'lbfgsb': opt_lbfgsb}
+                  'lbfgsb': opt_lbfgsb,
+                  'sgd': opt_SGD}
 
     for opt_name in optimizers.keys():
         if opt_name.lower().find(f_min.lower()) != -1:
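
For reference, a minimal usage sketch of how the new optimizer is reached. It assumes the model exposes an `optimize()` entry point that looks an optimizer up by name through `get_optimizer()` and forwards extra keyword arguments to the optimizer's constructor (which the `**kwargs` added to `Optimizer.__init__` makes possible); the `GPy.models.GPLVM(Y, 2)` constructor call, the `optimize()` signature, and the toy data are assumptions for illustration, not part of this patch.

```python
import numpy as np
import GPy

# Toy data: 100 samples, 20 output dimensions (purely illustrative).
Y = np.random.randn(100, 20)

# get_param_shapes() currently only knows about GPLVM, so a GPLVM with a
# 2-dimensional latent space is the natural test case (assumed constructor).
model = GPy.models.GPLVM(Y, 2)

# 'sgd' resolves to opt_SGD via the new entry in get_optimizer(); the extra
# keyword arguments are assumed to be forwarded to opt_SGD.__init__.
model.optimize('sgd', iterations=20, learning_rate=1e-4,
               momentum=0.9, batch_size=5, messages=True)
```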
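The core update in `opt_SGD.opt()` (the branch without missing data) computes the new step as `learning_rate * gradient` and adds a momentum term built from the *previous* step. The standalone sketch below restates just that rule with plain NumPy; the helper name `sgd_momentum_step` and the quadratic test function are illustrative only.

```python
import numpy as np

def sgd_momentum_step(x, grad, prev_step, learning_rate, momentum):
    """One update as performed in opt_SGD.opt():
    step(t) = learning_rate * grad, and x <- x - (step(t) + momentum * step(t-1))."""
    step = learning_rate * grad                 # update(t)
    x_new = x - (step + momentum * prev_step)   # apply update plus momentum
    return x_new, step                          # step becomes prev_step at t+1

# Tiny worked example on f(x) = 0.5 * ||x||^2, whose gradient is x itself.
x = np.array([1.0, -2.0])
prev_step = np.zeros_like(x)
lr = np.array([0.1, 0.1])                       # one learning rate per parameter, as in opt_SGD
for _ in range(3):
    x, prev_step = sgd_momentum_step(x, grad=x, prev_step=prev_step,
                                     learning_rate=lr, momentum=0.9)
```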