diff --git a/GPy/core/model.py b/GPy/core/model.py
index 7a8a3429..76f6dea7 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -256,7 +256,7 @@ class model(parameterised):
         self._set_params_transformed(x)
         LL_gradients = self._transform_gradients(self._log_likelihood_gradients())
         prior_gradients = self._transform_gradients(self._log_prior_gradients())
-        return -LL_gradients - prior_gradients
+        return - LL_gradients - prior_gradients
 
     def objective_and_gradients(self, x):
         obj_f = self.objective_function(x)
diff --git a/GPy/inference/SGD.py b/GPy/inference/SGD.py
index a1eb82d1..4fb12b28 100644
--- a/GPy/inference/SGD.py
+++ b/GPy/inference/SGD.py
@@ -3,6 +3,7 @@ import scipy as sp
 import scipy.sparse
 from optimization import Optimizer
 from scipy import linalg, optimize
+import pylab as plt
 import copy
 import sys
@@ -31,6 +32,16 @@ class opt_SGD(Optimizer):
         self.batch_size = batch_size
         self.self_paced = self_paced
         self.center = center
+        self.param_traces = [('noise',[])]
+        if len([p for p in self.model.kern.parts if p.name == 'bias']) == 1:
+            self.param_traces.append(('bias',[]))
+        if len([p for p in self.model.kern.parts if p.name == 'linear']) == 1:
+            self.param_traces.append(('linear',[]))
+        if len([p for p in self.model.kern.parts if p.name == 'rbf']) == 1:
+            self.param_traces.append(('rbf_var',[]))
+
+        self.param_traces = dict(self.param_traces)
+        self.fopt_trace = []
 
         num_params = len(self.model._get_params())
         if isinstance(self.learning_rate, float):
@@ -48,6 +59,18 @@ class opt_SGD(Optimizer):
         status += "Time elapsed: \t\t\t %s\n" % self.time
         return status
 
+    def plot_traces(self):
+        plt.figure()
+        plt.subplot(211)
+        plt.title('Parameters')
+        for k in self.param_traces.keys():
+            plt.plot(self.param_traces[k], label=k)
+        plt.legend(loc=0)
+        plt.subplot(212)
+        plt.title('Objective function')
+        plt.plot(self.fopt_trace)
+
+
     def non_null_samples(self, data):
         return (np.isnan(data).sum(axis=1) == 0)
@@ -128,25 +151,37 @@ class opt_SGD(Optimizer):
     def step_with_missing_data(self, f_fp, X, step, shapes, sparse_matrix):
         N, Q = X.shape
+
         if not sparse_matrix:
+            Y = self.model.likelihood.Y
             samples = self.non_null_samples(self.model.likelihood.Y)
             self.model.N = samples.sum()
-            self.model.likelihood.Y = self.model.likelihood.Y[samples]
+
+            if self.center:
+                self.model.likelihood._mean = Y[samples].mean()
+                self.model.likelihood._std = Y[samples].std()
+
+            self.model.likelihood.set_data(Y[samples])
         else:
             samples = self.model.likelihood.Y.nonzero()[0]
             self.model.N = len(samples)
-            self.model.likelihood.Y = np.asarray(self.model.likelihood.Y[samples].todense(), dtype = np.float64)
+            Y = np.asarray(self.model.likelihood.Y[samples].todense(), dtype = np.float64)
+            if self.center:
+                self.model.likelihood._mean = Y.mean()
+                self.model.likelihood._std = Y.std()
 
-        self.model.likelihood.N = self.model.N
+            self.model.likelihood.set_data(Y)
+
+        # self.model.likelihood.N = self.model.N
         j = self.subset_parameter_vector(self.x_opt, samples, shapes)
         self.model.X = X[samples]
 
         if self.model.N == 0 or self.model.likelihood.Y.std() == 0.0:
             return 0, step, self.model.N
 
-        if self.center:
-            self.model.likelihood.Y -= self.model.likelihood.Y.mean()
-            self.model.likelihood.Y /= self.model.likelihood.Y.std()
+        # if self.center:
+        #     self.model.likelihood.Y -= self.model.likelihood.Y.mean()
+        #     self.model.likelihood.Y /= self.model.likelihood.Y.std()
 
         model_name = self.model.__class__.__name__
@@ -154,13 +189,13 @@
             self.model.likelihood.trYYT = np.sum(np.square(self.model.likelihood.Y))
 
         b, p = self.shift_constraints(j)
-
-        momentum_term = self.momentum * step[j]
-
         f, fp = f_fp(self.x_opt[j])
-        step[j] = self.learning_rate[j] * fp
-        self.x_opt[j] -= step[j] + momentum_term
+        # momentum_term = self.momentum * step[j]
+        # step[j] = self.learning_rate[j] * fp
+        # self.x_opt[j] -= step[j] + momentum_term
+        step[j] = self.momentum * step[j] + self.learning_rate[j] * fp
+        self.x_opt[j] -= step[j]
 
         self.restore_constraints(b, p)
         return f, step, self.model.N
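The hunk above replaces the old two-line momentum update with the classical accumulated-velocity form. Previously `step[j]` held only the last `learning_rate * fp`, so the momentum term looked back exactly one update; `step[j] = self.momentum * step[j] + self.learning_rate[j] * fp` instead folds the whole gradient history in with geometric decay. A minimal standalone sketch of the new rule on a toy quadratic; `f_fp`, `lr` and `mu` are local stand-ins for the model's objective closure, `self.learning_rate` and `self.momentum`, and nothing below touches GPy:

```python
import numpy as np

def f_fp(x):
    # toy objective and gradient: f(x) = 0.5 * ||x||^2, so the gradient is x
    return 0.5 * np.dot(x, x), x

def sgd_momentum(x0, lr=0.1, mu=0.9, iterations=100):
    x = x0.copy()
    step = np.zeros_like(x)  # the velocity, playing the role of 'step' in the patch
    for _ in range(iterations):
        f, fp = f_fp(x)
        # classical momentum, as in the new code path: the velocity
        # accumulates the whole gradient history, decayed by mu each step
        step = mu * step + lr * fp
        x -= step
    return x

print(sgd_momentum(np.array([5.0, -3.0])))  # approaches the optimum at the origin
```

Setting `mu=0` recovers plain gradient descent, which gives an easy sanity check that the refactor leaves the no-momentum behaviour unchanged.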
@@ -177,10 +212,14 @@
         missing_data = self.check_for_missing(self.model.likelihood.Y)
         self.model.likelihood.YYT = None
+        self.model.likelihood.trYYT = None
+        self.model.likelihood._mean = 0.0
+        self.model.likelihood._std = 1.0
 
         num_params = self.model._get_params()
-        step = np.zeros_like(num_params)
+        step = np.zeros_like(num_params)
         for it in range(self.iterations):
+
             if it == 0 or self.self_paced is False:
                 features = np.random.permutation(Y.shape[1])
             else:
@@ -195,17 +234,21 @@
             for j in features:
                 count += 1
                 self.model.D = len(j)
-                self.model.likelihood.Y = Y[:, j]
+                self.model.likelihood.D = len(j)
+                self.model.likelihood.set_data(Y[:, j])
 
                 if missing_data or sparse_matrix:
                     shapes = self.get_param_shapes(N, Q)
                     f, step, Nj = self.step_with_missing_data(f_fp, X, step, shapes, sparse_matrix)
                 else:
                     Nj = N
-                    momentum_term = self.momentum * step # compute momentum using update(t-1)
                     f, fp = f_fp(self.x_opt)
-                    step = self.learning_rate * fp # compute update(t)
-                    self.x_opt -= step + momentum_term
+                    # momentum_term = self.momentum * step # compute momentum using update(t-1)
+                    # step = self.learning_rate * fp # compute update(t)
+                    # self.x_opt -= step + momentum_term
+                    step = self.momentum * step + self.learning_rate * fp
+                    self.x_opt -= step
+
                 if self.messages == 2:
                     noise = np.exp(self.x_opt)[-1]
@@ -216,12 +259,19 @@
                 NLL.append(f)
 
+            self.fopt_trace.append(f)
+            for k in self.param_traces.keys():
+                self.param_traces[k].append(self.model.get(k)[0])
+
+
+        # should really be a sum(), but earlier samples in the iteration will have a very crappy ll
        self.f_opt = np.mean(NLL)
         self.model.N = N
         self.model.X = X
         self.model.D = D
         self.model.likelihood.N = N
+        self.model.likelihood.D = D
         self.model.likelihood.Y = Y
         # self.model.Youter = np.dot(Y, Y.T)
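Together with the constructor changes above, these hunks add per-sweep bookkeeping: `fopt_trace` records one objective value per outer iteration, `param_traces` records one scalar per tracked kernel part via `self.model.get(k)[0]`, and `plot_traces()` draws both in two subplots. A self-contained sketch of the same record-then-plot pattern; `TraceRecorder` and everything in it are invented for illustration and are not GPy API:

```python
import numpy as np
import pylab as plt  # imported the same way the patch does

class TraceRecorder:
    # collect named scalar traces per iteration, then plot them afterwards
    def __init__(self, names):
        # list-of-pairs turned into a dict, mirroring the patch's constructor
        self.param_traces = dict((name, []) for name in names)
        self.fopt_trace = []

    def record(self, params, f):
        for name, value in params.items():
            self.param_traces[name].append(value)
        self.fopt_trace.append(f)

    def plot_traces(self):
        plt.figure()
        plt.subplot(211)
        plt.title('Parameters')
        for name, trace in self.param_traces.items():
            plt.plot(trace, label=name)
        plt.legend(loc=0)
        plt.subplot(212)
        plt.title('Objective function')
        plt.plot(self.fopt_trace)

# exercise the recorder with a faked optimisation run
rec = TraceRecorder(['noise', 'rbf_var'])
for it in range(100):
    rec.record({'noise': np.exp(-it / 30.), 'rbf_var': 1. + 0.01 * it},
               100. / (it + 1.))
rec.plot_traces()
plt.show()
```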
diff --git a/GPy/models/GPLVM.py b/GPy/models/GPLVM.py
index 32594594..3f7ba113 100644
--- a/GPy/models/GPLVM.py
+++ b/GPy/models/GPLVM.py
@@ -67,7 +67,13 @@ class GPLVM(GP):
         """
         util.plot.Tango.reset()
-
+
+        # this goes against the current standard in GPy, which is not to create
+        # figures inside the plot() functions. I think the standard should be
+        # changed in order to accommodate cases like this
+        fig = pb.figure()
+        ax = fig.add_subplot(111)
+
         if labels is None:
             labels = np.ones(self.N)
         if which_indices is None:
@@ -86,15 +92,17 @@
             input_1, input_2 = np.argsort(k.lengthscale)[:2]
         elif k.name=='linear':
             input_1, input_2 = np.argsort(k.variances)[::-1][:2]
+        else:
+            input_1, input_2 = which_indices
 
         #first, plot the output variance as a function of the latent space
         Xtest, xx,yy,xmin,xmax = util.plot.x_frame2D(self.X[:,[input_1, input_2]],resolution=resolution)
-        Xtest_full = np.zeros((Xtest.shape[0], self.X.shape[1]))
-        Xtest_full[:, :2] = Xtest
-        mu, var, low, up = self.predict(Xtest_full)
-        var = var[:, :2]
-        pb.imshow(var.reshape(resolution,resolution).T[::-1,:],extent=[xmin[0],xmax[0],xmin[1],xmax[1]],cmap=pb.cm.binary,interpolation='bilinear')
-
+        Xtest_full = np.zeros((Xtest.shape[0], self.X.shape[1]))
+        Xtest_full[:, :2] = Xtest
+        mu, var, low, up = self.predict(Xtest_full)
+        var = var[:, :1] # FIXME: this was a :2
+        pb.imshow(var.reshape(resolution,resolution).T[::-1,:],
+                  extent=[xmin[0], xmax[0], xmin[1], xmax[1]], cmap=pb.cm.binary,interpolation='bilinear')
 
         for i,ul in enumerate(np.unique(labels)):
             if type(ul) is np.string_:
@@ -121,5 +129,6 @@
 
         pb.xlim(xmin[0],xmax[0])
         pb.ylim(xmin[1],xmax[1])
-
+        pb.grid(b=False) # remove the grid if present; it doesn't look good
+        ax.set_aspect('auto') # set a nice aspect ratio
         return input_1, input_2
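On the `var[:, :1]` FIXME above: the grid `Xtest_full` has `resolution**2` rows and `predict` returns a variance column per output dimension, so only a single column has exactly `resolution * resolution` entries to reshape into the image that `imshow` draws; with `:2` the reshape gets twice as many values and fails whenever the model has more than one output. A standalone sketch with a synthetic stand-in for the predicted variance (`fake_var` and the grid construction are illustrative; nothing here calls GPy):

```python
import numpy as np
import pylab as pb

resolution = 50
xmin, xmax = np.array([-2., -2.]), np.array([2., 2.])
xx, yy = np.mgrid[xmin[0]:xmax[0]:complex(0, resolution),
                  xmin[1]:xmax[1]:complex(0, resolution)]
Xtest = np.column_stack([xx.ravel(), yy.ravel()])  # (resolution**2, 2) grid

# stand-in for the variance returned by predict: one column per output,
# here pretending the model has three output dimensions
fake_var = np.exp(-np.sum(Xtest ** 2, axis=1))[:, None] * np.ones((1, 3))

var = fake_var[:, :1]  # exactly resolution**2 values; :2 would be twice too many
pb.imshow(var.reshape(resolution, resolution).T[::-1, :],
          extent=[xmin[0], xmax[0], xmin[1], xmax[1]],
          cmap=pb.cm.binary, interpolation='bilinear')
pb.gca().set_aspect('auto')  # as in the patch: don't force square pixels
pb.show()
```

The old `:2` happened to work for single-output models only because NumPy silently clips a slice that runs past the last column.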