GPy/GPy/core/svigp.py

435 lines
18 KiB
Python
Raw Normal View History

# Copyright (c) 2012, James Hensman and Nicolo' Fusi
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
2013-07-18 15:15:09 +01:00
from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs, jitchol, backsub_both_sides
from gp import GP
import time
import sys
class SVIGP(GP):
"""
Stochastic Variational inference in a Gaussian Process
:param X: inputs
:type X: np.ndarray (N x Q)
:param Y: observed data
:type Y: np.ndarray of observations (N x D)
:param batchsize: the size of a h
Additional kwargs are used as for a sparse GP. They include:
:param q_u: canonical parameters of the distribution sqehd into a 1D array
:type q_u: np.ndarray
:param M: Number of inducing points (optional, default 10. Ignored if Z is not None)
:type M: int
:param kernel: the kernel/covariance function. See link kernels
:type kernel: a GPy kernel
:param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (M x Q) | None
:param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
:type X_uncertainty: np.ndarray (N x Q) | None
:param Zslices: slices for the inducing inputs (see slicing TODO: link)
:param M: Number of inducing points (optional, default 10. Ignored if Z is not None)
:type M: int
:param beta: noise precision. TODO: ignore beta if doing EP
:type beta: float
:param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool
"""
2014-09-24 10:07:20 +01:00
def __init__(self, X, Y, kernel, Z, q_u=None, batchsize=10):
raise NotImplementedError, "This is a work in progress, see github issue "
GP.__init__(self, X, Y, kernel)
self.batchsize=batchsize
self.Z = Z
self.num_inducing = Z.shape[0]
self.batchcounter = 0
self.epochs = 0
self.iterations = 0
self.vb_steplength = 0.05
self.param_steplength = 1e-5
self.momentum = 0.9
if q_u is None:
q_u = np.hstack((np.random.randn(self.num_inducing*self.output_dim),-.5*np.eye(self.num_inducing).flatten()))
2014-09-24 10:07:20 +01:00
self.set_vb_param(q_u)
self._permutation = np.random.permutation(self.num_data)
self.load_batch()
self._param_trace = []
self._ll_trace = []
self._grad_trace = []
#set the adaptive steplength parameters
2014-09-24 10:07:20 +01:00
#self.hbar_t = 0.0
#self.tau_t = 100.0
#self.gbar_t = 0.0
#self.gbar_t1 = 0.0
#self.gbar_t2 = 0.0
#self.hbar_tp = 0.0
#self.tau_tp = 10000.0
#self.gbar_tp = 0.0
#self.adapt_param_steplength = True
#self.adapt_vb_steplength = True
#self._param_steplength_trace = []
#self._vb_steplength_trace = []
def _compute_kernel_matrices(self):
# kernel computations, using BGPLVM notation
self.Kmm = self.kern.K(self.Z)
2014-09-24 10:07:20 +01:00
self.psi0 = self.kern.Kdiag(self.X_batch)
self.psi1 = self.kern.K(self.X_batch, self.Z)
self.psi2 = None
def dL_dtheta(self):
2014-02-11 12:20:57 +00:00
dL_dtheta = self.kern._param_grad_helper(self.dL_dKmm, self.Z)
if self.has_uncertain_inputs:
dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X_batch, self.X_variance_batch)
dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X_batch, self.X_variance_batch)
dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X_batch, self.X_variance_batch)
else:
2014-02-11 12:20:57 +00:00
dL_dtheta += self.kern._param_grad_helper(self.dL_dpsi1, self.X_batch, self.Z)
dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X_batch)
return dL_dtheta
def _set_params(self, p, computations=True):
self.kern._set_params_transformed(p[:self.kern.num_params])
self.likelihood._set_params(p[self.kern.num_params:])
if computations:
self._compute_kernel_matrices()
self._computations()
def _get_params(self):
return np.hstack((self.kern._get_params_transformed() , self.likelihood._get_params()))
def _get_param_names(self):
return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
def load_batch(self):
"""
2014-09-24 10:07:20 +01:00
load a batch of data (set self.X_batch and self.Y_batch from self.X, self.Y)
"""
#if we've seen all the data, start again with them in a new random order
if self.batchcounter+self.batchsize > self.num_data:
self.batchcounter = 0
self.epochs += 1
self._permutation = np.random.permutation(self.num_data)
this_perm = self._permutation[self.batchcounter:self.batchcounter+self.batchsize]
self.X_batch = self.X[this_perm]
2014-09-24 10:07:20 +01:00
self.Y_batch = self.Y[this_perm]
self.batchcounter += self.batchsize
self.data_prop = float(self.batchsize)/self.num_data
self._compute_kernel_matrices()
self._computations()
def _computations(self,do_Kmm=True, do_Kmm_grad=True):
"""
All of the computations needed. Some are optional, see kwargs.
"""
if do_Kmm:
self.Lm = jitchol(self.Kmm)
# The rather complex computations of self.A
if self.has_uncertain_inputs:
if self.likelihood.is_heteroscedastic:
psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.batchsize, 1, 1))).sum(0)
else:
psi2_beta = self.psi2.sum(0) * self.likelihood.precision
2013-07-19 10:28:23 +01:00
evals, evecs = np.linalg.eigh(psi2_beta)
clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
tmp = evecs * np.sqrt(clipped_evals)
else:
if self.likelihood.is_heteroscedastic:
tmp = self.psi1.T * (np.sqrt(self.likelihood.precision.flatten().reshape(1, self.batchsize)))
else:
tmp = self.psi1.T * (np.sqrt(self.likelihood.precision))
tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
self.A = tdot(tmp)
self.V = self.likelihood.precision*self.likelihood.Y
self.VmT = np.dot(self.V,self.q_u_expectation[0].T)
self.psi1V = np.dot(self.psi1.T, self.V)
self.B = np.eye(self.num_inducing)*self.data_prop + self.A
self.Lambda = backsub_both_sides(self.Lm, self.B.T)
self.LQL = backsub_both_sides(self.Lm,self.q_u_expectation[1].T,transpose='right')
self.trace_K = self.psi0.sum() - np.trace(self.A)/self.likelihood.precision
self.Kmmi_m, _ = dpotrs(self.Lm, self.q_u_expectation[0], lower=1)
self.projected_mean = np.dot(self.psi1,self.Kmmi_m)
# Compute dL_dpsi
self.dL_dpsi0 = - 0.5 * self.output_dim * self.likelihood.precision * np.ones(self.batchsize)
self.dL_dpsi1, _ = dpotrs(self.Lm,np.asfortranarray(self.VmT.T),lower=1)
self.dL_dpsi1 = self.dL_dpsi1.T
dL_dpsi2 = -0.5 * self.likelihood.precision * backsub_both_sides(self.Lm, self.LQL - self.output_dim * np.eye(self.num_inducing))
if self.has_uncertain_inputs:
self.dL_dpsi2 = np.repeat(dL_dpsi2[None,:,:],self.batchsize,axis=0)
else:
self.dL_dpsi1 += 2.*np.dot(dL_dpsi2,self.psi1.T).T
self.dL_dpsi2 = None
# Compute dL_dKmm
if do_Kmm_grad:
tmp = np.dot(self.LQL,self.A) - backsub_both_sides(self.Lm,np.dot(self.q_u_expectation[0],self.psi1V.T),transpose='right')
tmp += tmp.T
tmp += -self.output_dim*self.B
tmp += self.data_prop*self.LQL
self.dL_dKmm = 0.5*backsub_both_sides(self.Lm,tmp)
#Compute the gradient of the log likelihood wrt noise variance
self.partial_for_likelihood = -0.5*(self.batchsize*self.output_dim - np.sum(self.A*self.LQL))*self.likelihood.precision
self.partial_for_likelihood += (0.5*self.output_dim*self.trace_K + 0.5 * self.likelihood.trYYT - np.sum(self.likelihood.Y*self.projected_mean))*self.likelihood.precision**2
def log_likelihood(self):
"""
As for uncollapsed sparse GP, but account for the proportion of data we're looking at right now.
NB. self.batchsize is the size of the batch, not the size of X_all
"""
assert not self.likelihood.is_heteroscedastic
A = -0.5*self.batchsize*self.output_dim*(np.log(2.*np.pi) - np.log(self.likelihood.precision))
B = -0.5*self.likelihood.precision*self.output_dim*self.trace_K
Kmm_logdet = 2.*np.sum(np.log(np.diag(self.Lm)))
C = -0.5*self.output_dim*self.data_prop*(Kmm_logdet-self.q_u_logdet - self.num_inducing)
C += -0.5*np.sum(self.LQL * self.B)
D = -0.5*self.likelihood.precision*self.likelihood.trYYT
E = np.sum(self.V*self.projected_mean)
return (A+B+C+D+E)/self.data_prop
def _log_likelihood_gradients(self):
return np.hstack((self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))/self.data_prop
def vb_grad_natgrad(self):
"""
Compute the gradients of the lower bound wrt the canonical and
Expectation parameters of u.
2014-09-24 10:07:20 +01:00
Note that the natural gradient in either is given by the gradient in
the other (See Hensman et al 2012 Fast Variational inference in the
conjugate exponential Family)
"""
# Gradient for eta
dL_dmmT_S = -0.5*self.Lambda/self.data_prop + 0.5*self.q_u_prec
Kmmipsi1V,_ = dpotrs(self.Lm,self.psi1V,lower=1)
dL_dm = (Kmmipsi1V - np.dot(self.Lambda,self.q_u_mean))/self.data_prop
# Gradients for theta
S = self.q_u_cov
Si = self.q_u_prec
m = self.q_u_mean
dL_dSi = -mdot(S,dL_dmmT_S, S)
dL_dmhSi = -2*dL_dSi
dL_dSim = np.dot(dL_dSi,m) + np.dot(Si, dL_dm)
return np.hstack((dL_dm.flatten(),dL_dmmT_S.flatten())) , np.hstack((dL_dSim.flatten(), dL_dmhSi.flatten()))
def optimize(self, iterations, print_interval=10, callback=lambda:None, callback_interval=5):
param_step = 0.
2013-07-18 15:15:09 +01:00
#Iterate!
for i in range(iterations):
#store the current configuration for plotting later
self._param_trace.append(self._get_params())
self._ll_trace.append(self.log_likelihood() + self.log_prior())
#load a batch
self.load_batch()
#compute the (stochastic) gradient
natgrads = self.vb_grad_natgrad()
grads = self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
self._grad_trace.append(grads)
#compute the steps in all parameters
vb_step = self.vb_steplength*natgrads[0]
if (self.epochs>=1):#only move the parameters after the first epoch
param_step = self.momentum*param_step + self.param_steplength*grads
else:
param_step = 0.
self.set_vb_param(self.get_vb_param() + vb_step)
#Note: don't recompute everything here, wait until the next iteration when we have a new batch
self._set_params(self._untransform_params(self._get_params_transformed() + param_step), computations=False)
2013-07-18 15:15:09 +01:00
#print messages if desired
if i and (not i%print_interval):
print i, np.mean(self._ll_trace[-print_interval:]) #, self.log_likelihood()
print np.round(np.mean(self._grad_trace[-print_interval:],0),3)
sys.stdout.flush()
#callback
if i and not i%callback_interval:
2013-10-08 11:31:06 +01:00
callback(self) # Change this to callback()
time.sleep(0.01)
if self.epochs > 10:
self._adapt_steplength()
self.iterations += 1
def _adapt_steplength(self):
if self.adapt_vb_steplength:
# self._adaptive_vb_steplength()
self._adaptive_vb_steplength_KL()
self._vb_steplength_trace.append(self.vb_steplength)
assert self.vb_steplength > 0
if self.adapt_param_steplength:
2013-10-08 11:31:06 +01:00
self._adaptive_param_steplength()
# self._adaptive_param_steplength_log()
2013-10-08 11:31:06 +01:00
# self._adaptive_param_steplength_from_vb()
self._param_steplength_trace.append(self.param_steplength)
def _adaptive_param_steplength(self):
2013-10-08 11:31:06 +01:00
decr_factor = 0.02
g_tp = self._transform_gradients(self._log_likelihood_gradients())
self.gbar_tp = (1-1/self.tau_tp)*self.gbar_tp + 1/self.tau_tp * g_tp
self.hbar_tp = (1-1/self.tau_tp)*self.hbar_tp + 1/self.tau_tp * np.dot(g_tp.T, g_tp)
new_param_steplength = np.dot(self.gbar_tp.T, self.gbar_tp) / self.hbar_tp
#- hack
new_param_steplength *= decr_factor
self.param_steplength = (self.param_steplength + new_param_steplength)/2
#-
self.tau_tp = self.tau_tp*(1-self.param_steplength) + 1
def _adaptive_param_steplength_log(self):
stp = np.logspace(np.log(0.0001), np.log(1e-6), base=np.e, num=18000)
self.param_steplength = stp[self.iterations]
def _adaptive_param_steplength_log2(self):
self.param_steplength = (self.iterations + 0.001)**-0.5
def _adaptive_param_steplength_from_vb(self):
self.param_steplength = self.vb_steplength * 0.01
def _adaptive_vb_steplength(self):
decr_factor = 0.1
g_t = self.vb_grad_natgrad()[0]
self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t
self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t)
new_vb_steplength = np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t
#- hack
new_vb_steplength *= decr_factor
self.vb_steplength = (self.vb_steplength + new_vb_steplength)/2
#-
self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
def _adaptive_vb_steplength_KL(self):
2013-10-08 11:31:06 +01:00
decr_factor = 0.1
natgrad = self.vb_grad_natgrad()
g_t1 = natgrad[0]
g_t2 = natgrad[1]
self.gbar_t1 = (1-1/self.tau_t)*self.gbar_t1 + 1/self.tau_t * g_t1
self.gbar_t2 = (1-1/self.tau_t)*self.gbar_t2 + 1/self.tau_t * g_t2
self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t1.T, g_t2)
self.vb_steplength = np.dot(self.gbar_t1.T, self.gbar_t2) / self.hbar_t
self.vb_steplength *= decr_factor
self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
def _raw_predict(self, X_new, X_variance_new=None, which_parts='all',full_cov=False):
"""Internal helper function for making predictions, does not account for normalization"""
#TODO: make this more efficient!
self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
tmp = self.Kmmi- mdot(self.Kmmi,self.q_u_cov,self.Kmmi)
if X_variance_new is None:
Kx = self.kern.K(X_new,self.Z)
mu = np.dot(Kx,self.Kmmi_m)
if full_cov:
Kxx = self.kern.K(X_new)
var = Kxx - mdot(Kx,tmp,Kx.T)
else:
Kxx = self.kern.Kdiag(X_new)
var = (Kxx - np.sum(Kx*np.dot(Kx,tmp),1))[:,None]
return mu, var
else:
assert X_variance_new.shape == X_new.shape
Kx = self.kern.psi1(self.Z,X_new, X_variance_new)
mu = np.dot(Kx,self.Kmmi_m)
Kxx = self.kern.psi0(self.Z,X_new,X_variance_new)
psi2 = self.kern.psi2(self.Z,X_new,X_variance_new)
diag_var = Kxx - np.sum(np.sum(psi2*tmp[None,:,:],1),1)
if full_cov:
raise NotImplementedError
else:
return mu, diag_var[:,None]
def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
# normalize X values
Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
if X_variance_new is not None:
X_variance_new = X_variance_new / self._Xscale ** 2
# here's the actual prediction by the GP model
mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)
# now push through likelihood
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
return mean, var, _025pm, _975pm
def set_vb_param(self,vb_param):
"""set the distribution q(u) from the canonical parameters"""
self.q_u_canonical_flat = vb_param.copy()
self.q_u_canonical = self.q_u_canonical_flat[:self.num_inducing*self.output_dim].reshape(self.num_inducing,self.output_dim),self.q_u_canonical_flat[self.num_inducing*self.output_dim:].reshape(self.num_inducing,self.num_inducing)
self.q_u_prec = -2.*self.q_u_canonical[1]
self.q_u_cov, q_u_Li, q_u_L, tmp = pdinv(self.q_u_prec)
self.q_u_Li = q_u_Li
self.q_u_logdet = -tmp
self.q_u_mean, _ = dpotrs(q_u_Li, np.asfortranarray(self.q_u_canonical[0]),lower=1)
self.q_u_expectation = (self.q_u_mean, np.dot(self.q_u_mean,self.q_u_mean.T)+self.q_u_cov*self.output_dim)
def get_vb_param(self):
"""
Return the canonical parameters of the distribution q(u)
"""
return self.q_u_canonical_flat
2014-01-28 13:40:48 +00:00
def plot(self, *args, **kwargs):
"""
2014-01-28 13:47:59 +00:00
See GPy.plotting.matplot_dep.svig_plots.plot
2014-01-28 13:40:48 +00:00
"""
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
2014-01-28 13:47:59 +00:00
from ..plotting.matplot_dep import svig_plots
svig_plots.plot(self,*args,**kwargs)
def plot_traces(self):
2014-01-28 13:40:48 +00:00
"""
2014-01-28 13:47:59 +00:00
See GPy.plotting.matplot_dep.svig_plots.plot_traces
2014-01-28 13:40:48 +00:00
"""
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
2014-01-28 13:47:59 +00:00
from ..plotting.matplot_dep import svig_plots
svig_plots.plot_traces(self)