diff --git a/GPy/core/__init__.py b/GPy/core/__init__.py
index 32b6c02d..e9e049b0 100644
--- a/GPy/core/__init__.py
+++ b/GPy/core/__init__.py
@@ -4,6 +4,7 @@
 from model import *
 from parameterised import *
 import priors
-from GPy.core.gp import GP
-from GPy.core.sparse_gp import SparseGP
+from gp import GP
+from sparse_gp import SparseGP
 from fitc import FITC
+from svigp import SVIGP
diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py
index 0799e0c2..b82f3298 100644
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@@ -29,6 +29,7 @@ class GPBase(Model):
         self._Xscale = np.ones((1, self.input_dim))
 
         super(GPBase, self).__init__()
+        #Model.__init__(self)
 
         # All leaf nodes should call self._set_params(self._get_params()) at
         # the end
diff --git a/GPy/core/svigp.py b/GPy/core/svigp.py
new file mode 100644
index 00000000..1db0e26f
--- /dev/null
+++ b/GPy/core/svigp.py
@@ -0,0 +1,466 @@
+# Copyright (c) 2012, James Hensman and Nicolo' Fusi
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+import pylab as pb
+from .. import kern
+from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs, jitchol, backsub_both_sides
+from ..likelihoods import EP
+from gp_base import GPBase
+from model import Model
+import time
+import sys
+
+
+class SVIGP(GPBase):
+    """
+    Stochastic variational inference in a Gaussian process.
+
+    :param X: inputs
+    :type X: np.ndarray (N x Q)
+    :param likelihood: a likelihood object carrying the observed data Y (N x D)
+    :param kernel: the kernel (covariance function), a GPy kernel object
+    :param Z: inducing inputs
+    :type Z: np.ndarray (M x Q)
+    :param q_u: canonical parameters of the distribution q(u), squashed into a 1D array
+    :type q_u: np.ndarray
+    :param batchsize: the size of each mini-batch
+    :type batchsize: int
+    :param X_variance: the uncertainty in the measurements of X (Gaussian variance), optional
+    :type X_variance: np.ndarray (N x Q) | None
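+
+    A rough usage sketch (illustrative only; the shapes, values and the rbf+white
+    kernel choice below are assumptions, not defaults of this class)::
+
+        X = np.random.rand(500, 1)
+        Y = np.sin(X) + np.random.randn(500, 1) * 0.1
+        Z = np.random.rand(20, 1)
+        likelihood = GPy.likelihoods.Gaussian(Y, normalize=False)
+        kernel = GPy.kern.rbf(1) + GPy.kern.white(1, 1e-3)
+        m = SVIGP(X, likelihood, kernel, Z, batchsize=10)
+        m.ensure_default_constraints()
+        m.optimize(iterations=1000)
+
+    For regression with sensible defaults, see the SVIGPRegression wrapper in
+    GPy.models.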
+    """
+
+    def __init__(self, X, likelihood, kernel, Z, q_u=None, batchsize=10, X_variance=None):
+        GPBase.__init__(self, X, likelihood, kernel, normalize_X=False)
+        self.batchsize = batchsize
+        self.Y = self.likelihood.Y.copy()
+        self.Z = Z
+        self.num_inducing = Z.shape[0]
+
+        self.batchcounter = 0
+        self.epochs = 0
+        self.iterations = 0
+
+        self.vb_steplength = 0.05
+        self.param_steplength = 1e-5
+        self.momentum = 0.9
+
+        if X_variance is None:
+            self.has_uncertain_inputs = False
+        else:
+            self.has_uncertain_inputs = True
+            self.X_variance = X_variance
+
+        if q_u is None:
+            q_u = np.hstack((np.random.randn(self.num_inducing*self.output_dim), -.5*np.eye(self.num_inducing).flatten()))
+        self.set_vb_param(q_u)
+
+        self._permutation = np.random.permutation(self.num_data)
+        self.load_batch()
+
+        self._param_trace = []
+        self._ll_trace = []
+        self._grad_trace = []
+
+        # set the adaptive steplength parameters
+        self.hbar_t = 0.0
+        self.tau_t = 100.0
+        self.gbar_t = 0.0
+        self.gbar_t1 = 0.0
+        self.gbar_t2 = 0.0
+        self.hbar_tp = 0.0
+        self.tau_tp = 10000.0
+        self.gbar_tp = 0.0
+        self.adapt_param_steplength = True
+        self.adapt_vb_steplength = True
+        self._param_steplength_trace = []
+        self._vb_steplength_trace = []
+
+    def _compute_kernel_matrices(self):
+        # kernel computations, using BGPLVM notation
+        self.Kmm = self.kern.K(self.Z)
+        if self.has_uncertain_inputs:
+            self.psi0 = self.kern.psi0(self.Z, self.X_batch, self.X_variance_batch)
+            self.psi1 = self.kern.psi1(self.Z, self.X_batch, self.X_variance_batch)
+            self.psi2 = self.kern.psi2(self.Z, self.X_batch, self.X_variance_batch)
+        else:
+            self.psi0 = self.kern.Kdiag(self.X_batch)
+            self.psi1 = self.kern.K(self.X_batch, self.Z)
+            self.psi2 = None
+
+    def dL_dtheta(self):
+        dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z)
+        if self.has_uncertain_inputs:
+            dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X_batch, self.X_variance_batch)
+            dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X_batch, self.X_variance_batch)
+            dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X_batch, self.X_variance_batch)
+        else:
+            dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.X_batch, self.Z)
+            dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X_batch)
+        return dL_dtheta
+
+    def _set_params(self, p, computations=True):
+        self.kern._set_params_transformed(p[:self.kern.num_params])
+        self.likelihood._set_params(p[self.kern.num_params:])
+        if computations:
+            self._compute_kernel_matrices()
+            self._computations()
+
+    def _get_params(self):
+        return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))
+
+    def _get_param_names(self):
+        return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
+
+    def load_batch(self):
+        """
+        Load a batch of data (set self.X_batch and self.likelihood.Y from self.X, self.Y).
+        """
+        # if we've seen all the data, start again with them in a new random order
+        if self.batchcounter + self.batchsize > self.num_data:
+            self.batchcounter = 0
+            self.epochs += 1
+            self._permutation = np.random.permutation(self.num_data)
+
+        this_perm = self._permutation[self.batchcounter:self.batchcounter+self.batchsize]
+
+        self.X_batch = self.X[this_perm]
+        self.likelihood.set_data(self.Y[this_perm])
+        if self.has_uncertain_inputs:
+            self.X_variance_batch = self.X_variance[this_perm]
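+
+        # advance the batch pointer; data_prop is the fraction of the full data set
+        # held in this mini-batch, and is used below to rescale batch sums into
+        # estimates of full-data quantities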
+        self.batchcounter += self.batchsize
+
+        self.data_prop = float(self.batchsize)/self.num_data
+
+        self._compute_kernel_matrices()
+        self._computations()
+
+    def _computations(self, do_Kmm=True, do_Kmm_grad=True):
+        """
+        All of the computations needed. Some are optional, see kwargs.
+        """
+        if do_Kmm:
+            self.Lm = jitchol(self.Kmm)
+
+        # The rather complex computations of self.A
+        if self.has_uncertain_inputs:
+            if self.likelihood.is_heteroscedastic:
+                psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.batchsize, 1, 1))).sum(0)
+            else:
+                psi2_beta = self.psi2.sum(0) * self.likelihood.precision
+            evals, evecs = np.linalg.eigh(psi2_beta)
+            clipped_evals = np.clip(evals, 0., 1e6)  # TODO: make clipping configurable
+            tmp = evecs * np.sqrt(clipped_evals)
+        else:
+            if self.likelihood.is_heteroscedastic:
+                tmp = self.psi1.T * (np.sqrt(self.likelihood.precision.flatten().reshape(1, self.batchsize)))
+            else:
+                tmp = self.psi1.T * (np.sqrt(self.likelihood.precision))
+        tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
+        self.A = tdot(tmp)
+
+        self.V = self.likelihood.precision*self.likelihood.Y
+        self.VmT = np.dot(self.V, self.q_u_expectation[0].T)
+        self.psi1V = np.dot(self.psi1.T, self.V)
+
+        self.B = np.eye(self.num_inducing)*self.data_prop + self.A
+        self.Lambda = backsub_both_sides(self.Lm, self.B.T)
+        self.LQL = backsub_both_sides(self.Lm, self.q_u_expectation[1].T, transpose='right')
+
+        self.trace_K = self.psi0.sum() - np.trace(self.A)/self.likelihood.precision
+        self.Kmmi_m, _ = dpotrs(self.Lm, self.q_u_expectation[0], lower=1)
+        self.projected_mean = np.dot(self.psi1, self.Kmmi_m)
+
+        # Compute dL_dpsi
+        self.dL_dpsi0 = -0.5 * self.output_dim * self.likelihood.precision * np.ones(self.batchsize)
+        self.dL_dpsi1, _ = dpotrs(self.Lm, np.asfortranarray(self.VmT.T), lower=1)
+        self.dL_dpsi1 = self.dL_dpsi1.T
+
+        dL_dpsi2 = -0.5 * self.likelihood.precision * backsub_both_sides(self.Lm, self.LQL - self.output_dim * np.eye(self.num_inducing))
+        if self.has_uncertain_inputs:
+            self.dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], self.batchsize, axis=0)
+        else:
+            self.dL_dpsi1 += 2.*np.dot(dL_dpsi2, self.psi1.T).T
+            self.dL_dpsi2 = None
+
+        # Compute dL_dKmm
+        if do_Kmm_grad:
+            tmp = np.dot(self.LQL, self.A) - backsub_both_sides(self.Lm, np.dot(self.q_u_expectation[0], self.psi1V.T), transpose='right')
+            tmp += tmp.T
+            tmp += -self.output_dim*self.B
+            tmp += self.data_prop*self.LQL
+            self.dL_dKmm = 0.5*backsub_both_sides(self.Lm, tmp)
+
+        # Compute the gradient of the log likelihood wrt noise variance
+        self.partial_for_likelihood = -0.5*(self.batchsize*self.output_dim - np.sum(self.A*self.LQL))*self.likelihood.precision
+        self.partial_for_likelihood += (0.5*self.output_dim*self.trace_K + 0.5*self.likelihood.trYYT - np.sum(self.likelihood.Y*self.projected_mean))*self.likelihood.precision**2
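+
+    # Note on scaling: every term of the bound below is computed on the current
+    # mini-batch only, and the total is divided by data_prop (= batchsize/num_data),
+    # which rescales the batch terms into an estimate of the full-data bound. The
+    # terms that are multiplied by data_prop inside the sum therefore appear
+    # exactly once after the division.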
+    def log_likelihood(self):
+        """
+        As for the uncollapsed sparse GP, but accounting for the proportion of the
+        data we're looking at right now.
+
+        NB. self.batchsize is the size of the batch, not the size of X_all.
+        """
+        assert not self.likelihood.is_heteroscedastic
+        A = -0.5*self.batchsize*self.output_dim*(np.log(2.*np.pi) - np.log(self.likelihood.precision))
+        B = -0.5*self.likelihood.precision*self.output_dim*self.trace_K
+        Kmm_logdet = 2.*np.sum(np.log(np.diag(self.Lm)))
+        C = -0.5*self.output_dim*self.data_prop*(Kmm_logdet - self.q_u_logdet - self.num_inducing)
+        C += -0.5*np.sum(self.LQL * self.B)
+        D = -0.5*self.likelihood.precision*self.likelihood.trYYT
+        E = np.sum(self.V*self.projected_mean)
+        return (A+B+C+D+E)/self.data_prop
+
+    def _log_likelihood_gradients(self):
+        return np.hstack((self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))/self.data_prop
+
+    def vb_grad_natgrad(self):
+        """
+        Compute the gradients of the lower bound wrt the canonical and
+        expectation parameters of u.
+
+        Note that the natural gradient in either parameterisation is given by the
+        gradient in the other (see Hensman et al. 2012, "Fast variational inference
+        in the conjugate exponential family").
+        """
+        # gradient for eta
+        dL_dmmT_S = -0.5*self.Lambda/self.data_prop + 0.5*self.q_u_prec
+        Kmmipsi1V, _ = dpotrs(self.Lm, self.psi1V, lower=1)
+        dL_dm = (Kmmipsi1V - np.dot(self.Lambda, self.q_u_mean))/self.data_prop
+
+        # gradients for theta
+        S = self.q_u_cov
+        Si = self.q_u_prec
+        m = self.q_u_mean
+        dL_dSi = -mdot(S, dL_dmmT_S, S)
+
+        dL_dmhSi = -2*dL_dSi
+        dL_dSim = np.dot(dL_dSi, m) + np.dot(Si, dL_dm)
+
+        return np.hstack((dL_dm.flatten(), dL_dmmT_S.flatten())), np.hstack((dL_dSim.flatten(), dL_dmhSi.flatten()))
+
+    def optimize(self, iterations, print_interval=10, callback=lambda: None, callback_interval=5):
+
+        param_step = 0.
+
+        # iterate!
+        for i in range(iterations):
+
+            # store the current configuration for plotting later
+            self._param_trace.append(self._get_params())
+            self._ll_trace.append(self.log_likelihood() + self.log_prior())
+
+            # load a batch
+            self.load_batch()
+
+            # compute the (stochastic) gradient
+            natgrads = self.vb_grad_natgrad()
+            grads = self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
+            self._grad_trace.append(grads)
+
+            # compute the steps in all parameters
+            vb_step = self.vb_steplength*natgrads[0]
+            if self.epochs >= 1:  # only move the parameters after the first epoch
+                param_step = self.momentum*param_step + self.param_steplength*grads
+            else:
+                param_step = 0.
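+
+            # q(u) is updated with a natural-gradient step of size vb_steplength,
+            # while the kernel and likelihood parameters take a momentum stochastic
+            # gradient step of size param_steplength in the transformed space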
+            self.set_vb_param(self.get_vb_param() + vb_step)
+            # Note: don't recompute everything here, wait until the next iteration when we have a new batch
+            self._set_params(self._untransform_params(self._get_params_transformed() + param_step), computations=False)
+
+            # print messages if desired
+            if i and (not i % print_interval):
+                print i, np.mean(self._ll_trace[-print_interval:])  # , self.log_likelihood()
+                print np.round(np.mean(self._grad_trace[-print_interval:], 0), 3)
+                sys.stdout.flush()
+
+            # callback
+            if i and not i % callback_interval:
+                callback()
+                time.sleep(0.1)
+
+            if self.epochs > 10:
+                self._adapt_steplength()
+
+            self.iterations += 1
+
+    def _adapt_steplength(self):
+        if self.adapt_vb_steplength:
+            # self._adaptive_vb_steplength()
+            self._adaptive_vb_steplength_KL()
+            self._vb_steplength_trace.append(self.vb_steplength)
+            assert self.vb_steplength > 0
+
+        if self.adapt_param_steplength:
+            # self._adaptive_param_steplength()
+            # self._adaptive_param_steplength_log()
+            self._adaptive_param_steplength_from_vb()
+            self._param_steplength_trace.append(self.param_steplength)
+
+    def _adaptive_param_steplength(self):
+        decr_factor = 0.1
+        g_tp = self._transform_gradients(self._log_likelihood_gradients())
+        self.gbar_tp = (1-1/self.tau_tp)*self.gbar_tp + 1/self.tau_tp * g_tp
+        self.hbar_tp = (1-1/self.tau_tp)*self.hbar_tp + 1/self.tau_tp * np.dot(g_tp.T, g_tp)
+        new_param_steplength = np.dot(self.gbar_tp.T, self.gbar_tp) / self.hbar_tp
+        # - hack
+        new_param_steplength *= decr_factor
+        self.param_steplength = (self.param_steplength + new_param_steplength)/2
+        # -
+        self.tau_tp = self.tau_tp*(1-self.param_steplength) + 1
+
+    def _adaptive_param_steplength_log(self):
+        stp = np.logspace(np.log(0.0001), np.log(1e-6), base=np.e, num=18000)
+        self.param_steplength = stp[self.iterations]
+
+    def _adaptive_param_steplength_log2(self):
+        self.param_steplength = (self.iterations + 0.001)**-0.5
+
+    def _adaptive_param_steplength_from_vb(self):
+        self.param_steplength = self.vb_steplength * 0.01
+
+    def _adaptive_vb_steplength(self):
+        decr_factor = 0.1
+        g_t = self.vb_grad_natgrad()[0]
+        self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t
+        self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t)
+        new_vb_steplength = np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t
+        # - hack
+        new_vb_steplength *= decr_factor
+        self.vb_steplength = (self.vb_steplength + new_vb_steplength)/2
+        # -
+        self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
+
+    def _adaptive_vb_steplength_KL(self):
+        decr_factor = 1  # 0.1
+        natgrad = self.vb_grad_natgrad()
+        g_t1 = natgrad[0]
+        g_t2 = natgrad[1]
+        self.gbar_t1 = (1-1/self.tau_t)*self.gbar_t1 + 1/self.tau_t * g_t1
+        self.gbar_t2 = (1-1/self.tau_t)*self.gbar_t2 + 1/self.tau_t * g_t2
+        self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t1.T, g_t2)
+        self.vb_steplength = np.dot(self.gbar_t1.T, self.gbar_t2) / self.hbar_t
+        self.vb_steplength *= decr_factor
+        self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
+
+    def _raw_predict(self, X_new, X_variance_new=None, which_parts='all', full_cov=False):
+        """Internal helper function for making predictions, does not account for normalization."""
+        # TODO: make this more efficient!
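+        # the predictive density integrates u out under q(u) = N(m, S), giving
+        #   mean = Kxm Kmm^{-1} m
+        #   var  = Kxx - Kxm (Kmm^{-1} - Kmm^{-1} S Kmm^{-1}) Kmx
+        # which is what the code below computes (m and S are set in set_vb_param)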
+        self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
+        tmp = self.Kmmi - mdot(self.Kmmi, self.q_u_cov, self.Kmmi)
+
+        if X_variance_new is None:
+            Kx = self.kern.K(X_new, self.Z)
+            mu = np.dot(Kx, self.Kmmi_m)
+            if full_cov:
+                Kxx = self.kern.K(X_new)
+                var = Kxx - mdot(Kx, tmp, Kx.T)
+            else:
+                Kxx = self.kern.Kdiag(X_new)
+                var = (Kxx - np.sum(Kx*np.dot(Kx, tmp), 1))[:, None]
+            return mu, var
+        else:
+            assert X_variance_new.shape == X_new.shape
+            Kx = self.kern.psi1(self.Z, X_new, X_variance_new)
+            mu = np.dot(Kx, self.Kmmi_m)
+            Kxx = self.kern.psi0(self.Z, X_new, X_variance_new)
+            psi2 = self.kern.psi2(self.Z, X_new, X_variance_new)
+            diag_var = Kxx - np.sum(np.sum(psi2*tmp[None, :, :], 1), 1)
+            if full_cov:
+                raise NotImplementedError
+            else:
+                return mu, diag_var[:, None]
+
+    def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
+        # normalize X values
+        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
+        if X_variance_new is not None:
+            X_variance_new = X_variance_new / self._Xscale ** 2
+
+        # here's the actual prediction by the GP model
+        mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)
+
+        # now push through likelihood
+        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
+
+        return mean, var, _025pm, _975pm
+
+    def set_vb_param(self, vb_param):
+        """Set the distribution q(u) from its canonical parameters."""
+        self.q_u_canonical_flat = vb_param.copy()
+        self.q_u_canonical = self.q_u_canonical_flat[:self.num_inducing*self.output_dim].reshape(self.num_inducing, self.output_dim), self.q_u_canonical_flat[self.num_inducing*self.output_dim:].reshape(self.num_inducing, self.num_inducing)
+
+        self.q_u_prec = -2.*self.q_u_canonical[1]
+        self.q_u_cov, q_u_Li, q_u_L, tmp = pdinv(self.q_u_prec)
+        self.q_u_Li = q_u_Li
+        self.q_u_logdet = -tmp
+        self.q_u_mean, _ = dpotrs(q_u_Li, np.asfortranarray(self.q_u_canonical[0]), lower=1)
+
+        self.q_u_expectation = (self.q_u_mean, np.dot(self.q_u_mean, self.q_u_mean.T) + self.q_u_cov*self.output_dim)
+
+    def get_vb_param(self):
+        """
+        Return the canonical parameters of the distribution q(u).
+        """
+        return self.q_u_canonical_flat
+
+    def plot(self, ax=None, fignum=None, Z_height=None, **kwargs):
+
+        if ax is None:
+            fig = pb.figure(num=fignum)
+            ax = fig.add_subplot(111)
+
+        # horrible hack here:
+        data = self.likelihood.data.copy()
+        self.likelihood.data = self.Y
+        GPBase.plot(self, ax=ax, **kwargs)
+        self.likelihood.data = data
+
+        Zu = self.Z * self._Xscale + self._Xoffset
+        if self.input_dim == 1:
+            ax.plot(self.X_batch, self.likelihood.data, 'gx', mew=2)
+            if Z_height is None:
+                Z_height = ax.get_ylim()[0]
+            ax.plot(Zu, np.zeros_like(Zu) + Z_height, 'r|', mew=1.5, markersize=12)
+
+        if self.input_dim == 2:
+            ax.scatter(self.X_all[:, 0], self.X_all[:, 1], 20., self.Y[:, 0], linewidth=0, cmap=pb.cm.jet)
+            ax.plot(Zu[:, 0], Zu[:, 1], 'w^')
+
+    def plot_traces(self):
+        pb.figure()
+        t = np.array(self._param_trace)
+        pb.subplot(2, 1, 1)
+        for l, ti in zip(self._get_param_names(), t.T):
+            if not l[:3] == 'iip':
+                pb.plot(ti, label=l)
+        pb.legend(loc=0)
+
+        pb.subplot(2, 1, 2)
+        pb.plot(np.asarray(self._ll_trace), label='stochastic likelihood')
+        pb.legend(loc=0)
diff --git a/GPy/examples/__init__.py b/GPy/examples/__init__.py
index 0d73daff..2f74858a 100644
--- a/GPy/examples/__init__.py
+++ b/GPy/examples/__init__.py
@@ -5,3 +5,4 @@ import classification
 import regression
 import dimensionality_reduction
 import tutorials
+import stochastic
diff --git a/GPy/examples/stochastic.py b/GPy/examples/stochastic.py
new file mode 100644
index 00000000..45418d5c
--- /dev/null
+++ b/GPy/examples/stochastic.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import pylab as pb
+import numpy as np
+import GPy
+
+def toy_1d():
+    N = 2000
+    M = 20
+
+    # create data
+    X = np.linspace(0, 32, N)[:, None]
+    Z = np.linspace(0, 32, M)[:, None]
+    Y = np.sin(X) + np.cos(0.3*X) + np.random.randn(*X.shape)/np.sqrt(50.)
+
+    m = GPy.models.SVIGPRegression(X, Y, batchsize=10, Z=Z)
+    m.ensure_default_constraints()
+    m.constrain_bounded('noise_variance', 1e-3, 1e-1)
+
+    m.param_steplength = 1e-4
+
+    fig = pb.figure()
+    ax = fig.add_subplot(111)
+
+    def cb():
+        ax.cla()
+        m.plot(ax=ax, Z_height=-3)
+        ax.set_ylim(-3, 3)
+        fig.canvas.draw()
+
+    m.optimize(500, callback=cb, callback_interval=1)
+
+    m.plot_traces()
+    return m
diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
index f18e89db..885372a1 100644
--- a/GPy/models/__init__.py
+++ b/GPy/models/__init__.py
@@ -4,6 +4,7 @@
 from gp_regression import GPRegression
 from gp_classification import GPClassification
 from sparse_gp_regression import SparseGPRegression
+from svigp_regression import SVIGPRegression
 from sparse_gp_classification import SparseGPClassification
 from fitc_classification import FITCClassification
 from gplvm import GPLVM
diff --git a/GPy/models/svigp_regression.py b/GPy/models/svigp_regression.py
new file mode 100644
index 00000000..8448bf37
--- /dev/null
+++ b/GPy/models/svigp_regression.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2012, James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from ..core import SVIGP
+from .. import likelihoods
+from .. import kern
+
+class SVIGPRegression(SVIGP):
+    """
+    Gaussian process model for regression, using stochastic variational inference.
+
+    This is a thin wrapper around the SVIGP class, with a set of sensible defaults.
+
+    :param X: input observations
+    :param Y: observed values
+    :param kernel: a GPy kernel, defaults to rbf+white
+    :param Z: inducing inputs, defaults to a random subset of X
+    :type Z: np.ndarray (M x Q) | None
+    :param num_inducing: number of inducing points used when Z is not given (default 10)
+    :type num_inducing: int
+    :param q_u: canonical parameters of the distribution q(u), squashed into a 1D array (optional)
+    :param batchsize: the size of each mini-batch (default 10)
+    :type batchsize: int
+    :rtype: model object
+
+    .. Note:: Multiple independent outputs are allowed using columns of Y
+
+    """
+
+    def __init__(self, X, Y, kernel=None, Z=None, num_inducing=10, q_u=None, batchsize=10):
+        # kern defaults to rbf (plus white for stability)
+        if kernel is None:
+            kernel = kern.rbf(X.shape[1], variance=1., lengthscale=4.) + kern.white(X.shape[1], 1e-3)
+
+        # Z defaults to a subset of the data
+        if Z is None:
+            i = np.random.permutation(X.shape[0])[:num_inducing]
+            Z = X[i].copy()
+        else:
+            assert Z.shape[1] == X.shape[1]
+
+        # likelihood defaults to Gaussian
+        likelihood = likelihoods.Gaussian(Y, normalize=False)
+
+        SVIGP.__init__(self, X, likelihood, kernel, Z, q_u=q_u, batchsize=batchsize)
+        self.load_batch()
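+
+# A minimal usage sketch (illustrative only; the data below are made up and the
+# number of iterations is arbitrary):
+#
+#     X = np.random.rand(500, 1)
+#     Y = np.sin(6*X) + np.random.randn(500, 1)*0.1
+#     m = SVIGPRegression(X, Y, num_inducing=10, batchsize=20)
+#     m.ensure_default_constraints()
+#     m.optimize(1000)
+#     mean, var, lower, upper = m.predict(X)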