Merge pull request #542 from SheffieldML/EP_refactor

Merge in the EP refactoring
This commit is contained in:
Zhenwen Dai 2017-09-06 10:08:08 +01:00 committed by GitHub
commit 0fb273a41c
9 changed files with 721 additions and 202 deletions

View file

@ -6,9 +6,111 @@ from paramz import ObsAr
from . import ExactGaussianInference, VarDTC from . import ExactGaussianInference, VarDTC
from ...util import diag from ...util import diag
from .posterior import PosteriorEP as Posterior from .posterior import PosteriorEP as Posterior
from ...likelihoods import Gaussian
log_2_pi = np.log(2*np.pi) log_2_pi = np.log(2*np.pi)
#Four wrapper classes to help modularisation of different EP versions
class marginalMoments(object):
    """Per-site marginal moments of the tilted distribution.

    Holds the normalising constant Z_hat, mean mu_hat and variance
    sigma2_hat for each of the num_data sites, as flat float64 arrays.
    """
    def __init__(self, num_data):
        shape = (num_data,)
        self.Z_hat = np.empty(shape, dtype=np.float64)
        self.mu_hat = np.empty(shape, dtype=np.float64)
        self.sigma2_hat = np.empty(shape, dtype=np.float64)
class cavityParams(object):
    """Cavity distribution natural parameters (tau, v), one entry per site."""
    def __init__(self, num_data):
        self.tau = np.empty(num_data, dtype=np.float64)
        self.v = np.empty(num_data, dtype=np.float64)

    def _update_i(self, eta, ga_approx, post_params, i):
        """Form cavity i by removing a fraction eta of site i from the posterior marginal."""
        sigma_i = post_params.Sigma_diag[i]
        self.tau[i] = 1. / sigma_i - eta * ga_approx.tau[i]
        self.v[i] = post_params.mu[i] / sigma_i - eta * ga_approx.v[i]
class gaussianApproximation(object):
    """Gaussian site (factor) approximation parameters (v, tau), one entry per site."""
    def __init__(self, v, tau):
        self.tau = tau
        self.v = v

    def _update_i(self, eta, delta, post_params, marg_moments, i):
        """Damped update of site i from the new marginal moments.

        Returns the (possibly clipped) increments (delta_tau, delta_v)
        actually applied to the site parameters.
        """
        scale = delta / eta
        delta_tau = scale * (1. / marg_moments.sigma2_hat[i] - 1. / post_params.Sigma_diag[i])
        delta_v = scale * (marg_moments.mu_hat[i] / marg_moments.sigma2_hat[i]
                           - post_params.mu[i] / post_params.Sigma_diag[i])
        previous_tau = self.tau[i]
        self.tau[i] += delta_tau
        # Enforce positivity of tau_tilde. Even though positivity is guaranteed
        # for log-concave sites, numerical error can still make it negative, and
        # tau_tilde must stay positive to update the marginal likelihood without
        # running into stability issues.
        eps = np.finfo(float).eps
        if self.tau[i] < eps:
            self.tau[i] = eps
            delta_tau = self.tau[i] - previous_tau
        self.v[i] += delta_v
        return (delta_tau, delta_v)
class posteriorParamsBase(object):
    """Shared container for posterior parameters: q(f|X,Y) = N(f|mu, Sigma).

    Subclasses override _update_rank1 and _recompute; the base versions
    are no-ops so callers can invoke them unconditionally.
    """
    def __init__(self, mu, Sigma_diag):
        self.mu = mu
        self.Sigma_diag = Sigma_diag

    def _update_rank1(self, *args):
        # No-op in the base class.
        pass

    def _recompute(self, *args):
        # No-op in the base class.
        pass
class posteriorParams(posteriorParamsBase):
def __init__(self, mu, Sigma, L=None):
self.Sigma = Sigma
self.L = L
Sigma_diag = np.diag(self.Sigma)
super(posteriorParams, self).__init__(mu, Sigma_diag)
def _update_rank1(self, delta_tau, ga_approx, i):
ci = delta_tau/(1.+ delta_tau*self.Sigma_diag[i])
DSYR(self.Sigma, self.Sigma[:,i].copy(), -ci)
self.mu = np.dot(self.Sigma, ga_approx.v)
@staticmethod
def _recompute(K, ga_approx):
num_data = len(ga_approx.tau)
tau_tilde_root = np.sqrt(ga_approx.tau)
Sroot_tilde_K = tau_tilde_root[:,None] * K
B = np.eye(num_data) + Sroot_tilde_K * tau_tilde_root[None,:]
L = jitchol(B)
V, _ = dtrtrs(L, Sroot_tilde_K, lower=1)
Sigma = K - np.dot(V.T,V) #K - KS^(1/2)BS^(1/2)K = (K^(-1) + \Sigma^(-1))^(-1)
mu = np.dot(Sigma,ga_approx.v)
return posteriorParams(mu=mu, Sigma=Sigma, L=L)
class posteriorParamsDTC(posteriorParamsBase):
def __init__(self, mu, Sigma_diag):
super(posteriorParamsDTC, self).__init__(mu, Sigma_diag)
def _update_rank1(self, LLT, Kmn, delta_v, delta_tau, i):
#DSYR(Sigma, Sigma[:,i].copy(), -delta_tau/(1.+ delta_tau*Sigma[i,i]))
DSYR(LLT,Kmn[:,i].copy(),delta_tau)
L = jitchol(LLT)
V,info = dtrtrs(L,Kmn,lower=1)
self.Sigma_diag = np.maximum(np.sum(V*V,-2), np.finfo(float).eps) #diag(K_nm (L L^\top)^(-1)) K_mn
si = np.sum(V.T*V[:,i],-1) #(V V^\top)[:,i]
self.mu += (delta_v-delta_tau*self.mu[i])*si
#mu = np.dot(Sigma, v_tilde)
@staticmethod
def _recompute(LLT0, Kmn, ga_approx):
LLT = LLT0 + np.dot(Kmn*ga_approx.tau[None,:],Kmn.T)
L = jitchol(LLT)
V, _ = dtrtrs(L,Kmn,lower=1)
#Sigma_diag = np.sum(V*V,-2)
#Knmv_tilde = np.dot(Kmn,v_tilde)
#mu = np.dot(V2.T,Knmv_tilde)
Sigma = np.dot(V.T,V)
mu = np.dot(Sigma, ga_approx.v)
Sigma_diag = np.diag(Sigma).copy()
return posteriorParamsDTC(mu, Sigma_diag), LLT
class EPBase(object): class EPBase(object):
def __init__(self, epsilon=1e-6, eta=1., delta=1., always_reset=False, max_iters=np.inf, ep_mode="alternated", parallel_updates=False): def __init__(self, epsilon=1e-6, eta=1., delta=1., always_reset=False, max_iters=np.inf, ep_mode="alternated", parallel_updates=False):
""" """
@ -28,6 +130,7 @@ class EPBase(object):
:parallel_updates: boolean. If true, updates of the parameters of the sites in parallel :parallel_updates: boolean. If true, updates of the parameters of the sites in parallel
""" """
super(EPBase, self).__init__() super(EPBase, self).__init__()
self.always_reset = always_reset self.always_reset = always_reset
self.epsilon, self.eta, self.delta, self.max_iters = epsilon, eta, delta, max_iters self.epsilon, self.eta, self.delta, self.max_iters = epsilon, eta, delta, max_iters
self.ep_mode = ep_mode self.ep_mode = ep_mode
@ -35,7 +138,7 @@ class EPBase(object):
self.reset() self.reset()
def reset(self): def reset(self):
self.old_mutilde, self.old_vtilde = None, None self.ga_approx_old = None
self._ep_approximation = None self._ep_approximation = None
def on_optimization_start(self): def on_optimization_start(self):
@ -45,6 +148,11 @@ class EPBase(object):
# TODO: update approximation in the end as well? Maybe even with a switch? # TODO: update approximation in the end as well? Maybe even with a switch?
pass pass
def _stop_criteria(self, ga_approx):
tau_diff = np.mean(np.square(ga_approx.tau-self.ga_approx_old.tau))
v_diff = np.mean(np.square(ga_approx.v-self.ga_approx_old.v))
return ((tau_diff < self.epsilon) and (v_diff < self.epsilon))
def __setstate__(self, state): def __setstate__(self, state):
super(EPBase, self).__setstate__(state[0]) super(EPBase, self).__setstate__(state[0])
self.epsilon, self.eta, self.delta = state[1] self.epsilon, self.eta, self.delta = state[1]
@ -67,19 +175,18 @@ class EP(EPBase, ExactGaussianInference):
if self.ep_mode=="nested": if self.ep_mode=="nested":
#Force EP at each step of the optimization #Force EP at each step of the optimization
self._ep_approximation = None self._ep_approximation = None
mu, Sigma, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata) post_params, ga_approx, cav_params, log_Z_tilde = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata)
elif self.ep_mode=="alternated": elif self.ep_mode=="alternated":
if getattr(self, '_ep_approximation', None) is None: if getattr(self, '_ep_approximation', None) is None:
#if we don't yet have the results of runnign EP, run EP and store the computed factors in self._ep_approximation #if we don't yet have the results of runnign EP, run EP and store the computed factors in self._ep_approximation
mu, Sigma, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata) post_params, ga_approx, cav_params, log_Z_tilde = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata)
else: else:
#if we've already run EP, just use the existing approximation stored in self._ep_approximation #if we've already run EP, just use the existing approximation stored in self._ep_approximation
mu, Sigma, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation post_params, ga_approx, cav_params, log_Z_tilde = self._ep_approximation
else: else:
raise ValueError("ep_mode value not valid") raise ValueError("ep_mode value not valid")
v_tilde = mu_tilde * tau_tilde return self._inference(Y, K, ga_approx, cav_params, likelihood, Y_metadata=Y_metadata, Z_tilde=log_Z_tilde)
return self._inference(K, tau_tilde, v_tilde, likelihood, Y_metadata=Y_metadata, Z_tilde=log_Z_tilde.sum())
def expectation_propagation(self, K, Y, likelihood, Y_metadata): def expectation_propagation(self, K, Y, likelihood, Y_metadata):
@ -90,41 +197,57 @@ class EP(EPBase, ExactGaussianInference):
# than ObsArrays # than ObsArrays
Y = Y.values.copy() Y = Y.values.copy()
#Initial values - Marginal moments #Initial values - Marginal moments, cavity params, gaussian approximation params and posterior params
Z_hat = np.empty(num_data,dtype=np.float64) marg_moments = marginalMoments(num_data)
mu_hat = np.empty(num_data,dtype=np.float64) cav_params = cavityParams(num_data)
sigma2_hat = np.empty(num_data,dtype=np.float64) ga_approx, post_params = self._init_approximations(K, num_data)
tau_cav = np.empty(num_data,dtype=np.float64) #Approximation
v_cav = np.empty(num_data,dtype=np.float64) stop = False
iterations = 0
while not stop and (iterations < self.max_iters):
self._local_updates(num_data, cav_params, post_params, marg_moments, ga_approx, likelihood, Y, Y_metadata)
#(re) compute Sigma and mu using full Cholesky decompy
post_params = posteriorParams._recompute(K, ga_approx)
#monitor convergence
if iterations > 0:
stop = self._stop_criteria(ga_approx)
self.ga_approx_old = gaussianApproximation(ga_approx.v.copy(), ga_approx.tau.copy())
iterations += 1
# Z_tilde after removing the terms that can lead to infinite terms due to tau_tilde close to zero.
# These terms cancel with the corresponding terms in the marginal log-likelihood
log_Z_tilde = self._log_Z_tilde(marg_moments, ga_approx, cav_params)
# - 0.5*np.log(tau_tilde) + 0.5*(v_tilde*v_tilde*1./tau_tilde)
return (post_params, ga_approx, cav_params, log_Z_tilde)
def _init_approximations(self, K, num_data):
#initial values - Gaussian factors #initial values - Gaussian factors
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
if self.old_mutilde is None: if self.ga_approx_old is None:
tau_tilde, mu_tilde, v_tilde = np.zeros((3, num_data)) v_tilde, tau_tilde = np.zeros((2, num_data))
ga_approx = gaussianApproximation(v_tilde, tau_tilde)
Sigma = K.copy() Sigma = K.copy()
diag.add(Sigma, 1e-7) diag.add(Sigma, 1e-7)
mu = np.zeros(num_data) mu = np.zeros(num_data)
post_params = posteriorParams(mu, Sigma)
else: else:
assert self.old_mutilde.size == num_data, "data size mis-match: did you change the data? try resetting!" assert self.ga_approx_old.v.size == num_data, "data size mis-match: did you change the data? try resetting!"
mu_tilde, v_tilde = self.old_mutilde, self.old_vtilde ga_approx = gaussianApproximation(self.ga_approx_old.v, self.ga_approx_old.tau)
tau_tilde = v_tilde/mu_tilde post_params = posteriorParams._recompute(K, ga_approx)
mu, Sigma, _ = self._ep_compute_posterior(K, tau_tilde, v_tilde) diag.add(post_params.Sigma, 1e-7)
diag.add(Sigma, 1e-7)
# TODO: Check the log-marginal under both conditions and choose the best one # TODO: Check the log-marginal under both conditions and choose the best one
return (ga_approx, post_params)
#Approximation def _local_updates(self, num_data, cav_params, post_params, marg_moments, ga_approx, likelihood, Y, Y_metadata, update_order=None):
tau_diff = self.epsilon + 1. if update_order is None:
v_diff = self.epsilon + 1. update_order = np.random.permutation(num_data)
tau_tilde_old = np.nan
v_tilde_old = np.nan
iterations = 0
while ((tau_diff > self.epsilon) or (v_diff > self.epsilon)) and (iterations < self.max_iters):
update_order = np.random.permutation(num_data)
for i in update_order: for i in update_order:
#Cavity distribution parameters #Cavity distribution parameters
tau_cav[i] = 1./Sigma[i,i] - self.eta*tau_tilde[i] cav_params._update_i(self.eta, ga_approx, post_params, i)
v_cav[i] = mu[i]/Sigma[i,i] - self.eta*v_tilde[i]
if Y_metadata is not None: if Y_metadata is not None:
# Pick out the relavent metadata for Yi # Pick out the relavent metadata for Yi
Y_metadata_i = {} Y_metadata_i = {}
@ -133,91 +256,45 @@ class EP(EPBase, ExactGaussianInference):
else: else:
Y_metadata_i = None Y_metadata_i = None
#Marginal moments #Marginal moments
Z_hat[i], mu_hat[i], sigma2_hat[i] = likelihood.moments_match_ep(Y[i], tau_cav[i], v_cav[i], Y_metadata_i=Y_metadata_i) marg_moments.Z_hat[i], marg_moments.mu_hat[i], marg_moments.sigma2_hat[i] = likelihood.moments_match_ep(Y[i], cav_params.tau[i], cav_params.v[i], Y_metadata_i=Y_metadata_i)
#Site parameters update
delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
tau_tilde_prev = tau_tilde[i]
tau_tilde[i] += delta_tau
# Enforce positivity of tau_tilde. Even though this is guaranteed for logconcave sites, it is still possible #Site parameters update
# to get negative values due to numerical errors. Moreover, the value of tau_tilde should be positive in order to delta_tau, delta_v = ga_approx._update_i(self.eta, self.delta, post_params, marg_moments, i)
# update the marginal likelihood without inestability issues.
if tau_tilde[i] < np.finfo(float).eps:
tau_tilde[i] = np.finfo(float).eps
delta_tau = tau_tilde[i] - tau_tilde_prev
v_tilde[i] += delta_v
if self.parallel_updates == False: if self.parallel_updates == False:
#Posterior distribution parameters update post_params._update_rank1(delta_tau, ga_approx, i)
ci = delta_tau/(1.+ delta_tau*Sigma[i,i])
DSYR(Sigma, Sigma[:,i].copy(), -ci)
mu = np.dot(Sigma, v_tilde)
#(re) compute Sigma and mu using full Cholesky decompy def _log_Z_tilde(self, marg_moments, ga_approx, cav_params):
mu, Sigma, _ = self._ep_compute_posterior(K, tau_tilde, v_tilde) return np.sum((np.log(marg_moments.Z_hat) + 0.5*np.log(2*np.pi) + 0.5*np.log(1+ga_approx.tau/cav_params.tau) - 0.5 * ((ga_approx.v)**2 * 1./(cav_params.tau + ga_approx.tau))
+ 0.5*(cav_params.v * ( ( (ga_approx.tau/cav_params.tau) * cav_params.v - 2.0 * ga_approx.v ) * 1./(cav_params.tau + ga_approx.tau)))))
#monitor convergence
if iterations > 0:
tau_diff = np.mean(np.square(tau_tilde-tau_tilde_old))
v_diff = np.mean(np.square(v_tilde-v_tilde_old))
tau_tilde_old = tau_tilde.copy()
v_tilde_old = v_tilde.copy()
iterations += 1
mu_tilde = v_tilde/tau_tilde def _ep_marginal(self, K, ga_approx, Z_tilde):
mu_cav = v_cav/tau_cav post_params = posteriorParams._recompute(K, ga_approx)
sigma2_sigma2tilde = 1./tau_cav + 1./tau_tilde
# Z_tilde after removing the terms that can lead to infinite terms due to tau_tilde close to zero.
# This terms cancel with the coreresponding terms in the marginal loglikelihood
log_Z_tilde = (np.log(Z_hat) + 0.5*np.log(2*np.pi) + 0.5*np.log(1+tau_tilde/tau_cav)
- 0.5 * ((v_tilde)**2 * 1./(tau_cav + tau_tilde)) + 0.5*(v_cav * ( ( (tau_tilde/tau_cav) * v_cav - 2.0 * v_tilde ) * 1./(tau_cav + tau_tilde))))
# - 0.5*np.log(tau_tilde) + 0.5*(v_tilde*v_tilde*1./tau_tilde)
self.old_mutilde = mu_tilde
self.old_vtilde = v_tilde
return mu, Sigma, mu_tilde, tau_tilde, log_Z_tilde
def _ep_compute_posterior(self, K, tau_tilde, v_tilde):
num_data = len(tau_tilde)
tau_tilde_root = np.sqrt(tau_tilde)
Sroot_tilde_K = tau_tilde_root[:,None] * K
B = np.eye(num_data) + Sroot_tilde_K * tau_tilde_root[None,:]
L = jitchol(B)
V, _ = dtrtrs(L, Sroot_tilde_K, lower=1)
Sigma = K - np.dot(V.T,V) #K - KS^(1/2)BS^(1/2)K = (K^(-1) + \Sigma^(-1))^(-1)
mu = np.dot(Sigma,v_tilde)
return (mu, Sigma, L)
def _ep_marginal(self, K, tau_tilde, v_tilde, Z_tilde):
mu, Sigma, L = self._ep_compute_posterior(K, tau_tilde, v_tilde)
# Gaussian log marginal excluding terms that can go to infinity due to arbitrarily small tau_tilde. # Gaussian log marginal excluding terms that can go to infinity due to arbitrarily small tau_tilde.
# These terms cancel out with the terms excluded from Z_tilde # These terms cancel out with the terms excluded from Z_tilde
B_logdet = np.sum(2.0*np.log(np.diag(L))) B_logdet = np.sum(2.0*np.log(np.diag(post_params.L)))
log_marginal = 0.5*(-len(tau_tilde) * log_2_pi - B_logdet + np.sum(v_tilde * np.dot(Sigma,v_tilde))) log_marginal = 0.5*(-len(ga_approx.tau) * log_2_pi - B_logdet + np.sum(ga_approx.v * np.dot(post_params.Sigma,ga_approx.v)))
log_marginal += Z_tilde log_marginal += Z_tilde
return log_marginal, mu, Sigma, L return log_marginal, post_params
def _inference(self, K, tau_tilde, v_tilde, likelihood, Z_tilde, Y_metadata=None): def _inference(self, Y, K, ga_approx, cav_params, likelihood, Z_tilde, Y_metadata=None):
log_marginal, mu, Sigma, L = self._ep_marginal(K, tau_tilde, v_tilde, Z_tilde) log_marginal, post_params = self._ep_marginal(K, ga_approx, Z_tilde)
tau_tilde_root = np.sqrt(tau_tilde) tau_tilde_root = np.sqrt(ga_approx.tau)
Sroot_tilde_K = tau_tilde_root[:,None] * K Sroot_tilde_K = tau_tilde_root[:,None] * K
aux_alpha , _ = dpotrs(L, np.dot(Sroot_tilde_K, v_tilde), lower=1) aux_alpha , _ = dpotrs(post_params.L, np.dot(Sroot_tilde_K, ga_approx.v), lower=1)
alpha = (v_tilde - tau_tilde_root * aux_alpha)[:,None] #(K + Sigma^(\tilde))^(-1) /mu^(/tilde) alpha = (ga_approx.v - tau_tilde_root * aux_alpha)[:,None] #(K + Sigma^(\tilde))^(-1) /mu^(/tilde)
LWi, _ = dtrtrs(L, np.diag(tau_tilde_root), lower=1) LWi, _ = dtrtrs(post_params.L, np.diag(tau_tilde_root), lower=1)
Wi = np.dot(LWi.T,LWi) Wi = np.dot(LWi.T,LWi)
symmetrify(Wi) #(K + Sigma^(\tilde))^(-1) symmetrify(Wi) #(K + Sigma^(\tilde))^(-1)
dL_dK = 0.5 * (tdot(alpha) - Wi) dL_dK = 0.5 * (tdot(alpha) - Wi)
dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata) dL_dthetaL = likelihood.ep_gradients(Y, cav_params.tau, cav_params.v, np.diag(dL_dK), Y_metadata=Y_metadata, quad_mode='gh')
return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha} return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha}
@ -244,24 +321,25 @@ class EPDTC(EPBase, VarDTC):
if self.ep_mode=="nested": if self.ep_mode=="nested":
#Force EP at each step of the optimization #Force EP at each step of the optimization
self._ep_approximation = None self._ep_approximation = None
mu, Sigma_diag, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata) post_params, ga_approx, log_Z_tilde = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata)
elif self.ep_mode=="alternated": elif self.ep_mode=="alternated":
if getattr(self, '_ep_approximation', None) is None: if getattr(self, '_ep_approximation', None) is None:
#if we don't yet have the results of runnign EP, run EP and store the computed factors in self._ep_approximation #if we don't yet have the results of runnign EP, run EP and store the computed factors in self._ep_approximation
mu, Sigma_diag, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata) post_params, ga_approx, log_Z_tilde = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata)
else: else:
#if we've already run EP, just use the existing approximation stored in self._ep_approximation #if we've already run EP, just use the existing approximation stored in self._ep_approximation
mu, Sigma_diag, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation post_params, ga_approx, log_Z_tilde = self._ep_approximation
else: else:
raise ValueError("ep_mode value not valid") raise ValueError("ep_mode value not valid")
return super(EPDTC, self).inference(kern, X, Z, likelihood, mu_tilde, mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
return super(EPDTC, self).inference(kern, X, Z, likelihood, ObsAr(mu_tilde[:,None]),
mean_function=mean_function, mean_function=mean_function,
Y_metadata=Y_metadata, Y_metadata=Y_metadata,
precision=tau_tilde, precision=ga_approx.tau,
Lm=Lm, dL_dKmm=dL_dKmm, Lm=Lm, dL_dKmm=dL_dKmm,
psi0=psi0, psi1=psi1, psi2=psi2, Z_tilde=log_Z_tilde.sum()) psi0=psi0, psi1=psi1, psi2=psi2, Z_tilde=log_Z_tilde)
def expectation_propagation(self, Kmm, Kmn, Y, likelihood, Y_metadata): def expectation_propagation(self, Kmm, Kmn, Y, likelihood, Y_metadata):
@ -272,119 +350,91 @@ class EPDTC(EPBase, VarDTC):
# than ObsArrays # than ObsArrays
Y = Y.values.copy() Y = Y.values.copy()
#Initial values - Marginal moments #Initial values - Marginal moments, cavity params, gaussian approximation params and posterior params
Z_hat = np.zeros(num_data,dtype=np.float64) marg_moments = marginalMoments(num_data)
mu_hat = np.zeros(num_data,dtype=np.float64) cav_params = cavityParams(num_data)
sigma2_hat = np.zeros(num_data,dtype=np.float64) ga_approx, post_params, LLT0, LLT = self._init_approximations(Kmm, Kmn, num_data)
tau_cav = np.empty(num_data,dtype=np.float64) #Approximation
v_cav = np.empty(num_data,dtype=np.float64) stop = False
iterations = 0
while not stop and (iterations < self.max_iters):
self._local_updates(num_data, LLT0, LLT, Kmn, cav_params, post_params, marg_moments, ga_approx, likelihood, Y, Y_metadata)
#(re) compute Sigma, Sigma_diag and mu using full Cholesky decompy
post_params, LLT = posteriorParamsDTC._recompute(LLT0, Kmn, ga_approx)
post_params.Sigma_diag = np.maximum(post_params.Sigma_diag, np.finfo(float).eps)
#monitor convergence
if iterations > 0:
stop = self._stop_criteria(ga_approx)
self.ga_approx_old = gaussianApproximation(ga_approx.v.copy(), ga_approx.tau.copy())
iterations += 1
log_Z_tilde = self._log_Z_tilde(marg_moments, ga_approx, cav_params)
return post_params, ga_approx, log_Z_tilde
def _log_Z_tilde(self, marg_moments, ga_approx, cav_params):
mu_tilde = ga_approx.v/ga_approx.tau
mu_cav = cav_params.v/cav_params.tau
sigma2_sigma2tilde = 1./cav_params.tau + 1./ga_approx.tau
return np.sum((np.log(marg_moments.Z_hat) + 0.5*np.log(2*np.pi) + 0.5*np.log(sigma2_sigma2tilde)
+ 0.5*((mu_cav - mu_tilde)**2) / (sigma2_sigma2tilde)))
def _init_approximations(self, Kmm, Kmn, num_data):
#initial values - Gaussian factors #initial values - Gaussian factors
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
LLT0 = Kmm.copy() LLT0 = Kmm.copy()
Lm = jitchol(LLT0) #K_m = L_m L_m^\top Lm = jitchol(LLT0) #K_m = L_m L_m^\top
Vm,info = dtrtrs(Lm,Kmn,lower=1) Vm,info = dtrtrs(Lm, Kmn,lower=1)
# Lmi = dtrtri(Lm) # Lmi = dtrtri(Lm)
# Kmmi = np.dot(Lmi.T,Lmi) # Kmmi = np.dot(Lmi.T,Lmi)
# KmmiKmn = np.dot(Kmmi,Kmn) # KmmiKmn = np.dot(Kmmi,Kmn)
# Qnn_diag = np.sum(Kmn*KmmiKmn,-2) # Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
Qnn_diag = np.sum(Vm*Vm,-2) #diag(Knm Kmm^(-1) Kmn) Qnn_diag = np.sum(Vm*Vm,-2) #diag(Knm Kmm^(-1) Kmn)
#diag.add(LLT0, 1e-8) #diag.add(LLT0, 1e-8)
if self.old_mutilde is None: if self.ga_approx_old is None:
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
LLT = LLT0.copy() #Sigma = K.copy() LLT = LLT0.copy() #Sigma = K.copy()
mu = np.zeros(num_data) mu = np.zeros(num_data)
Sigma_diag = Qnn_diag.copy() + 1e-8 Sigma_diag = Qnn_diag.copy() + 1e-8
tau_tilde, mu_tilde, v_tilde = np.zeros((3, num_data)) v_tilde, tau_tilde = np.zeros((2, num_data))
ga_approx = gaussianApproximation(v_tilde, tau_tilde)
post_params = posteriorParamsDTC(mu, Sigma_diag)
else: else:
assert self.old_mutilde.size == num_data, "data size mis-match: did you change the data? try resetting!" assert self.ga_approx_old.v.size == num_data, "data size mis-match: did you change the data? try resetting!"
mu_tilde, v_tilde = self.old_mutilde, self.old_vtilde ga_approx = gaussianApproximation(self.ga_approx_old.v, self.ga_approx_old.tau)
tau_tilde = v_tilde/mu_tilde post_params, LLT = posteriorParamsDTC._recompute(LLT0, Kmn, ga_approx)
mu, Sigma_diag, LLT = self._ep_compute_posterior(LLT0, Kmn, tau_tilde, v_tilde) post_params.Sigma_diag += 1e-8
Sigma_diag += 1e-8
# TODO: Check the log-marginal under both conditions and choose the best one # TODO: Check the log-marginal under both conditions and choose the best one
#Approximation return (ga_approx, post_params, LLT0, LLT)
tau_diff = self.epsilon + 1.
v_diff = self.epsilon + 1. def _local_updates(self, num_data, LLT0, LLT, Kmn, cav_params, post_params, marg_moments, ga_approx, likelihood, Y, Y_metadata, update_order=None):
tau_tilde_old = np.nan if update_order is None:
v_tilde_old = np.nan
iterations = 0
while ((tau_diff > self.epsilon) or (v_diff > self.epsilon)) and (iterations < self.max_iters):
update_order = np.random.permutation(num_data) update_order = np.random.permutation(num_data)
for i in update_order: for i in update_order:
#Cavity distribution parameters
tau_cav[i] = 1./Sigma_diag[i] - self.eta*tau_tilde[i]
v_cav[i] = mu[i]/Sigma_diag[i] - self.eta*v_tilde[i]
if Y_metadata is not None:
# Pick out the relavent metadata for Yi
Y_metadata_i = {}
for key in Y_metadata.keys():
Y_metadata_i[key] = Y_metadata[key][i, :]
else:
Y_metadata_i = None
#Marginal moments #Cavity distribution parameters
Z_hat[i], mu_hat[i], sigma2_hat[i] = likelihood.moments_match_ep(Y[i], tau_cav[i], v_cav[i], Y_metadata_i=Y_metadata_i) cav_params._update_i(self.eta, ga_approx, post_params, i)
#Site parameters update
delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
tau_tilde_prev = tau_tilde[i]
tau_tilde[i] += delta_tau
# Enforce positivity of tau_tilde. Even though this is guaranteed for logconcave sites, it is still possible
# to get negative values due to numerical errors. Moreover, the value of tau_tilde should be positive in order to
# update the marginal likelihood without inestability issues.
if tau_tilde[i] < np.finfo(float).eps:
tau_tilde[i] = np.finfo(float).eps
delta_tau = tau_tilde[i] - tau_tilde_prev
v_tilde[i] += delta_v
#Posterior distribution parameters update if Y_metadata is not None:
if self.parallel_updates == False: # Pick out the relavent metadata for Yi
#DSYR(Sigma, Sigma[:,i].copy(), -delta_tau/(1.+ delta_tau*Sigma[i,i])) Y_metadata_i = {}
DSYR(LLT,Kmn[:,i].copy(),delta_tau) for key in Y_metadata.keys():
L = jitchol(LLT) Y_metadata_i[key] = Y_metadata[key][i, :]
V,info = dtrtrs(L,Kmn,lower=1) else:
Sigma_diag = np.maximum(np.sum(V*V,-2), np.finfo(float).eps) #diag(K_nm (L L^\top)^(-1)) K_mn Y_metadata_i = None
si = np.sum(V.T*V[:,i],-1) #(V V^\top)[:,i]
mu += (delta_v-delta_tau*mu[i])*si
#mu = np.dot(Sigma, v_tilde)
#(re) compute Sigma, Sigma_diag and mu using full Cholesky decompy #Marginal moments
mu, Sigma_diag, LLT = self._ep_compute_posterior(LLT0, Kmn, tau_tilde, v_tilde) marg_moments.Z_hat[i], marg_moments.mu_hat[i], marg_moments.sigma2_hat[i] = likelihood.moments_match_ep(Y[i], cav_params.tau[i], cav_params.v[i], Y_metadata_i=Y_metadata_i)
Sigma_diag = np.maximum(Sigma_diag, np.finfo(float).eps) #Site parameters update
delta_tau, delta_v = ga_approx._update_i(self.eta, self.delta, post_params, marg_moments, i)
#monitor convergence #Posterior distribution parameters update
if iterations>0: if self.parallel_updates == False:
tau_diff = np.mean(np.square(tau_tilde-tau_tilde_old)) post_params._update_rank1(LLT, Kmn, delta_v, delta_tau, i)
v_diff = np.mean(np.square(v_tilde-v_tilde_old))
tau_tilde_old = tau_tilde.copy()
v_tilde_old = v_tilde.copy()
iterations += 1
mu_tilde = v_tilde/tau_tilde
mu_cav = v_cav/tau_cav
sigma2_sigma2tilde = 1./tau_cav + 1./tau_tilde
log_Z_tilde = (np.log(Z_hat) + 0.5*np.log(2*np.pi) + 0.5*np.log(sigma2_sigma2tilde)
+ 0.5*((mu_cav - mu_tilde)**2) / (sigma2_sigma2tilde))
self.old_mutilde = mu_tilde
self.old_vtilde = v_tilde
return mu, Sigma_diag, ObsAr(mu_tilde[:,None]), tau_tilde, log_Z_tilde
def _ep_compute_posterior(self, LLT0, Kmn, tau_tilde, v_tilde):
LLT = LLT0 + np.dot(Kmn*tau_tilde[None,:],Kmn.T)
L = jitchol(LLT)
V, _ = dtrtrs(L,Kmn,lower=1)
#Sigma_diag = np.sum(V*V,-2)
#Knmv_tilde = np.dot(Kmn,v_tilde)
#mu = np.dot(V2.T,Knmv_tilde)
Sigma = np.dot(V.T,V)
mu = np.dot(Sigma,v_tilde)
Sigma_diag = np.diag(Sigma).copy()
return (mu, Sigma_diag, LLT)

View file

@ -66,7 +66,14 @@ class Binomial(Likelihood):
np.testing.assert_array_equal(N.shape, y.shape) np.testing.assert_array_equal(N.shape, y.shape)
nchoosey = special.gammaln(N+1) - special.gammaln(y+1) - special.gammaln(N-y+1) nchoosey = special.gammaln(N+1) - special.gammaln(y+1) - special.gammaln(N-y+1)
return nchoosey + y*np.log(inv_link_f) + (N-y)*np.log(1.-inv_link_f)
Ny = N-y
t1 = np.zeros(y.shape)
t2 = np.zeros(y.shape)
t1[y>0] = y[y>0]*np.log(inv_link_f[y>0])
t2[Ny>0] = Ny[Ny>0]*np.log(1.-inv_link_f[Ny>0])
return nchoosey + t1 + t2
def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None): def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
""" """
@ -86,7 +93,13 @@ class Binomial(Likelihood):
N = Y_metadata['trials'] N = Y_metadata['trials']
np.testing.assert_array_equal(N.shape, y.shape) np.testing.assert_array_equal(N.shape, y.shape)
return y/inv_link_f - (N-y)/(1.-inv_link_f) Ny = N-y
t1 = np.zeros(y.shape)
t2 = np.zeros(y.shape)
t1[y>0] = y[y>0]/inv_link_f[y>0]
t2[Ny>0] = (Ny[Ny>0])/(1.-inv_link_f[Ny>0])
return t1 - t2
def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None): def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None):
""" """
@ -111,7 +124,13 @@ class Binomial(Likelihood):
""" """
N = Y_metadata['trials'] N = Y_metadata['trials']
np.testing.assert_array_equal(N.shape, y.shape) np.testing.assert_array_equal(N.shape, y.shape)
return -y/np.square(inv_link_f) - (N-y)/np.square(1.-inv_link_f) Ny = N-y
t1 = np.zeros(y.shape)
t2 = np.zeros(y.shape)
t1[y>0] = -y[y>0]/np.square(inv_link_f[y>0])
t2[Ny>0] = -(Ny[Ny>0])/np.square(1.-inv_link_f[Ny>0])
return t1+t2
def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None): def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None):
""" """
@ -135,8 +154,14 @@ class Binomial(Likelihood):
N = Y_metadata['trials'] N = Y_metadata['trials']
np.testing.assert_array_equal(N.shape, y.shape) np.testing.assert_array_equal(N.shape, y.shape)
inv_link_f2 = np.square(inv_link_f) #inv_link_f2 = np.square(inv_link_f) #TODO Remove. Why is this here?
return 2*y/inv_link_f**3 - 2*(N-y)/(1.-inv_link_f)**3
Ny = N-y
t1 = np.zeros(y.shape)
t2 = np.zeros(y.shape)
t1[y>0] = 2*y[y>0]/inv_link_f[y>0]**3
t2[Ny>0] = - 2*(Ny[Ny>0])/(1.-inv_link_f[Ny>0])**3
return t1 + t2
def samples(self, gp, Y_metadata=None, **kw): def samples(self, gp, Y_metadata=None, **kw):
""" """

View file

@ -57,7 +57,10 @@ class Gaussian(Likelihood):
def update_gradients(self, grad): def update_gradients(self, grad):
self.variance.gradient = grad self.variance.gradient = grad
def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None): def ep_gradients(self, Y, cav_tau, cav_v, dL_dKdiag, Y_metadata=None, quad_mode='gk', boost_grad=1.):
return self.exact_inference_gradients(dL_dKdiag)
def exact_inference_gradients(self, dL_dKdiag, Y_metadata=None):
return dL_dKdiag.sum() return dL_dKdiag.sum()
def _preprocess_values(self, Y): def _preprocess_values(self, Y):

View file

@ -6,8 +6,12 @@ from scipy import stats,special
import scipy as sp import scipy as sp
from . import link_functions from . import link_functions
from ..util.misc import chain_1, chain_2, chain_3, blockify_dhess_dtheta, blockify_third, blockify_hessian, safe_exp from ..util.misc import chain_1, chain_2, chain_3, blockify_dhess_dtheta, blockify_third, blockify_hessian, safe_exp
from ..util.quad_integrate import quadgk_int
from scipy.integrate import quad from scipy.integrate import quad
from functools import partial
import warnings import warnings
from ..core.parameterization import Parameterized from ..core.parameterization import Parameterized
class Likelihood(Parameterized): class Likelihood(Parameterized):
@ -223,6 +227,91 @@ class Likelihood(Parameterized):
self.__gh_points = np.polynomial.hermite.hermgauss(T) self.__gh_points = np.polynomial.hermite.hermgauss(T)
return self.__gh_points return self.__gh_points
def ep_gradients(self, Y, cav_tau, cav_v, dL_dKdiag, Y_metadata=None, quad_mode='gk', boost_grad=1.):
    """Gradients of the EP log marginal likelihood w.r.t. likelihood parameters.

    Each data point contributes E[d log p(y_i|f_i)/dtheta] under its tilted
    (cavity x likelihood) distribution, evaluated by numerical quadrature;
    the per-site contributions are then summed.

    :param Y: observed outputs, shape (num_data, output_dim)
    :param cav_tau: cavity precisions, same shape as Y
    :param cav_v: cavity natural means (tau * mu), same shape as Y
    :param dL_dKdiag: unused here; kept so the signature matches
        exact_inference_gradients-style callers
    :param Y_metadata: optional dict of per-point metadata arrays
    :param quad_mode: 'gk' (Gauss-Kronrod) or 'gh' (Gauss-Hermite)
    :param boost_grad: scalar multiplier applied to the summed gradient
    :returns: gradient vector, one entry per likelihood parameter
    """
    if self.size > 0:
        tau, v, Y = cav_tau.flatten(), cav_v.flatten(), Y.flatten()
        # Cavity moments from natural parameters.
        mu = v / tau
        sigma2 = 1. / tau
        # Slice the metadata row-wise so each site sees only its own entry.
        Y_metadata_list = []
        for index in range(len(Y)):
            Y_metadata_i = {}
            if Y_metadata is not None:
                for key in Y_metadata.keys():
                    Y_metadata_i[key] = Y_metadata[key][index, :]
            Y_metadata_list.append(Y_metadata_i)
        if quad_mode == 'gk':
            # One Gauss-Kronrod integral per data point; after transposing
            # with zip, rows index parameters and columns index data points.
            # list(...) is required: np.vstack/np.hstack no longer accept
            # bare generators (and Python 3's zip is a generator).
            quads = np.vstack(list(zip(*map(self.integrate_gk, Y, mu,
                                            np.sqrt(sigma2), Y_metadata_list))))
        elif quad_mode == 'gh':
            quads = np.hstack(list(zip(*map(self.integrate_gh, Y, mu,
                                            np.sqrt(sigma2)))))
            quads = quads.T
        else:
            raise Exception("no other quadrature mode available")
        # Sum site contributions, ignoring NaNs from failed quadratures.
        dL_dtheta = boost_grad * np.nansum(quads, axis=1)
    else:
        # No tunable likelihood parameters.
        dL_dtheta = np.zeros(self.num_params)
    return dL_dtheta
def integrate_gk(self, Y, mu, sigma, Y_metadata_i=None):
    """Site gradient for one data point via Gauss-Kronrod quadrature.

    Integrates likelihood(f) * N(f; mu, sigma^2) * dlogpdf_dtheta(f) over
    the whole real line, giving the (unnormalised) expected parameter
    derivative under the tilted distribution of this site.

    :param Y: observed output for this site
    :param mu: cavity mean
    :param sigma: cavity standard deviation
    :param Y_metadata_i: optional metadata dict for this single point
    :returns: integral value(s), one per likelihood parameter
    """
    norm_const = np.sqrt(2. * np.pi)

    def integrand(f):
        # Unnormalised tilted weight: exact likelihood times cavity Gaussian.
        cavity = np.exp(-0.5 * np.square((f - mu) / sigma)) / (norm_const * sigma)
        weight = np.exp(self.logpdf_link(f, Y, Y_metadata_i)) * cavity
        return weight * self.dlogpdf_dtheta(f, Y, Y_metadata_i)

    # Infinite limits: quadgk_int handles the substitution onto (-1, 1).
    return quadgk_int(integrand, fmin=-np.inf, fmax=np.inf)
def integrate_gh(self, Y, mu, sigma, Y_metadata_i=None, gh_points=None):
    """Site gradient for one data point via Gauss-Hermite quadrature.

    Computes E_f{d log p(y_i|f_i)/dtheta} where theta is a likelihood
    parameter and the expectation is taken under an approximation to the
    marginal posterior (the unnormalised product of the Gaussian cavity
    and the exact likelihood term). Needed for gradients of the EP
    marginal likelihood estimate Z_EP w.r.t. likelihood parameters.

    :param Y: observed output for this site (scalar, broadcast over nodes)
    :param mu: cavity mean
    :param sigma: cavity standard deviation
    :param Y_metadata_i: optional metadata dict for this single point
    :param gh_points: optional (nodes, weights) pair; defaults to 32-point rule
    :returns: quadrature estimate, one entry per likelihood parameter
    """
    if gh_points is None:
        gh_x, gh_w = self._gh_points(32)
    else:
        gh_x, gh_w = gh_points
    # Map the Hermite nodes onto the cavity: f = sqrt(2)*sigma*x + mu.
    X = gh_x[None, :] * np.sqrt(2.) * sigma + mu
    # X is a grid of candidate f values; Y broadcasts against it.
    lik_weight = np.exp(self.logpdf_link(X, Y, Y_metadata_i))
    lik_weight = lik_weight.repeat(self.num_params, 0)
    dtheta = self.dlogpdf_dtheta(X, Y, Y_metadata_i)
    # Element-wise product of the two flattened grids, restored to
    # dtheta's original shape.
    products = [p * q for p, q in zip(lik_weight.flatten(), dtheta.flatten())]
    fn = np.array(products).reshape(dtheta.shape)
    return np.dot(fn, gh_w) / np.sqrt(np.pi)
def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None): def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
""" """
Use Gauss-Hermite Quadrature to compute Use Gauss-Hermite Quadrature to compute

View file

@ -0,0 +1,147 @@
import numpy as np
import unittest
import GPy
from GPy.models import GradientChecker
fixed_seed = 10
from nose.tools import with_setup, nottest
# this file will contain some high level tests, this is not unit testing, but will give us a higher level estimate
# if things are going well under the hood.
class TestObservationModels(unittest.TestCase):
    """High-level smoke tests: EP inference variants (alternated, nested,
    fractional) should roughly agree with Laplace inference on simple
    classification and robust-regression problems."""

    def setUp(self):
        np.random.seed(fixed_seed)
        self.N = 100
        self.D = 2
        self.X = np.random.rand(self.N, self.D)
        self.real_noise_std = 0.05
        noise = np.random.randn(*self.X[:, 0].shape) * self.real_noise_std
        self.Y = (np.sin(self.X[:, 0] * 2 * np.pi) + noise)[:, None]
        self.num_points = self.X.shape[0]
        self.f = np.random.rand(self.N, 1)
        # FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin int is the documented drop-in replacement.
        self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=int)[:, None]
        # self.binary_Y[self.binary_Y == 0.0] = -1.0
        self.positive_Y = np.exp(self.Y.copy())
        self.Y_noisy = self.Y.copy()
        self.Y_verynoisy = self.Y.copy()
        self.Y_noisy[75] += 1.3  # inject a single outlier for the robust-likelihood test
        self.init_var = 0.15
        self.deg_free = 4.
        censored = np.zeros_like(self.Y)
        # NOTE(review): replace=True can draw duplicate indices, so fewer than
        # N/2 distinct points may be censored — presumably intentional;
        # confirm if exact coverage matters.
        random_inds = np.random.choice(self.N, int(self.N / 2), replace=True)
        censored[random_inds] = 1
        self.Y_metadata = dict()
        self.Y_metadata['censored'] = censored
        self.kernel1 = GPy.kern.RBF(self.X.shape[1]) + GPy.kern.White(self.X.shape[1])

    def tearDown(self):
        self.Y = None
        self.X = None
        self.binary_Y = None
        self.positive_Y = None
        self.kernel1 = None

    @with_setup(setUp, tearDown)
    def testEPClassification(self):
        # Compare Laplace (ground truth) against the three EP variants on
        # binary classification with a Bernoulli likelihood.
        bernoulli = GPy.likelihoods.Bernoulli()
        laplace_inf = GPy.inference.latent_function_inference.Laplace()
        ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode='alternated')
        ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode='nested')
        ep_inf_fractional = GPy.inference.latent_function_inference.EP(ep_mode='nested', eta=0.9)
        m1 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=laplace_inf)
        m1.randomize()
        m2 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_alt)
        m2.randomize()
        m3 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_nested)
        m3.randomize()
        m4 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_fractional)
        m4.randomize()
        optimizer = 'bfgs'
        # do gradcheck here ...
        # self.assertTrue(m1.checkgrad())
        # self.assertTrue(m2.checkgrad())
        # self.assertTrue(m3.checkgrad())
        # self.assertTrue(m4.checkgrad())
        m1.optimize(optimizer=optimizer, max_iters=300)
        m2.optimize(optimizer=optimizer, max_iters=300)
        m3.optimize(optimizer=optimizer, max_iters=300)
        m4.optimize(optimizer=optimizer, max_iters=300)
        # taking laplace predictions as the ground truth
        probs_mean_lap, probs_var_lap = m1.predict(self.X)
        probs_mean_ep_alt, probs_var_ep_alt = m2.predict(self.X)
        probs_mean_ep_nested, probs_var_ep_nested = m3.predict(self.X)
        # For simple single-dimension data the marginal likelihoods of the
        # Laplace and EP approximations should not be far apart.
        self.assertAlmostEqual(m1.log_likelihood(), m2.log_likelihood(), delta=1)
        self.assertAlmostEqual(m1.log_likelihood(), m3.log_likelihood(), delta=1)
        self.assertAlmostEqual(m1.log_likelihood(), m4.log_likelihood(), delta=5)
        GPy.util.classification.conf_matrix(probs_mean_lap, self.binary_Y)
        GPy.util.classification.conf_matrix(probs_mean_ep_alt, self.binary_Y)
        GPy.util.classification.conf_matrix(probs_mean_ep_nested, self.binary_Y)

    @nottest
    def rmse(self, Y, Ystar):
        # Root-mean-squared error helper (excluded from test collection).
        return np.sqrt(np.mean((Y - Ystar) ** 2))

    @with_setup(setUp, tearDown)
    def test_EP_with_StudentT(self):
        # Robust regression with an outlier: Student-t likelihood under
        # Laplace vs alternated EP inference.
        studentT = GPy.likelihoods.StudentT(deg_free=self.deg_free, sigma2=self.init_var)
        laplace_inf = GPy.inference.latent_function_inference.Laplace()
        ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode='alternated')
        ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode='nested')
        ep_inf_frac = GPy.inference.latent_function_inference.EP(ep_mode='nested', eta=0.7)
        m1 = GPy.core.GP(self.X.copy(), self.Y_noisy.copy(), kernel=self.kernel1.copy(), likelihood=studentT.copy(), inference_method=laplace_inf)
        # optimize
        m1['.*white'].constrain_fixed(1e-5)
        m1.randomize()
        m2 = GPy.core.GP(self.X.copy(), self.Y_noisy.copy(), kernel=self.kernel1.copy(), likelihood=studentT.copy(), inference_method=ep_inf_alt)
        m2['.*white'].constrain_fixed(1e-5)
        # m2.constrain_bounded('.*t_scale2', 0.001, 10)
        m2.randomize()
        # m3 = GPy.core.GP(self.X, self.Y_noisy.copy(), kernel=self.kernel1, likelihood=studentT.copy(), inference_method=ep_inf_nested)
        # m3['.*white'].constrain_fixed(1e-5)
        # # m3.constrain_bounded('.*t_scale2', 0.001, 10)
        # m3.randomize()
        optimizer = 'bfgs'
        m1.optimize(optimizer=optimizer, max_iters=400)
        m2.optimize(optimizer=optimizer, max_iters=400)
        # m3.optimize(optimizer=optimizer, max_iters=500)
        self.assertAlmostEqual(m1.log_likelihood(), m2.log_likelihood(), delta=200)
        # self.assertAlmostEqual(m1.log_likelihood(), m3.log_likelihood(), 3)
        preds_mean_lap, preds_var_lap = m1.predict(self.X)
        preds_mean_alt, preds_var_alt = m2.predict(self.X)
        # preds_mean_nested, preds_var_nested = m3.predict(self.X)
        rmse_lap = self.rmse(preds_mean_lap, self.Y)
        rmse_alt = self.rmse(preds_mean_alt, self.Y)
        # rmse_nested = self.rmse(preds_mean_nested, self.Y_noisy)
        # EP may be slightly worse than Laplace here, but not by much.
        if rmse_alt > rmse_lap:
            self.assertAlmostEqual(rmse_lap, rmse_alt, delta=1.5)
        # m3.optimize(optimizer=optimizer, max_iters=500)
if __name__ == "__main__":
    # Run this module's tests directly via the unittest CLI.
    unittest.main()

View file

@ -61,6 +61,18 @@ class InferenceGPEP(unittest.TestCase):
Y = lik.samples(f).reshape(-1,1) Y = lik.samples(f).reshape(-1,1)
return X, Y return X, Y
def genNoisyData(self):
np.random.seed(1)
X = np.random.rand(100,1)
self.real_std = 0.1
noise = np.random.randn(*X[:, 0].shape)*self.real_std
Y = (np.sin(X[:, 0]*2*np.pi) + noise)[:, None]
self.f = np.random.rand(X.shape[0],1)
Y_extra_noisy = Y.copy()
Y_extra_noisy[50] += 4.
# Y_extra_noisy[80:83] -= 2.
return X, Y, Y_extra_noisy
def test_inference_EP(self): def test_inference_EP(self):
from paramz import ObsAr from paramz import ObsAr
X, Y = self.genData() X, Y = self.genData()
@ -73,11 +85,45 @@ class InferenceGPEP(unittest.TestCase):
inference_method=inf, inference_method=inf,
likelihood=lik) likelihood=lik)
K = self.model.kern.K(X) K = self.model.kern.K(X)
mu, Sigma, mu_tilde, tau_tilde, log_Z_tilde = self.model.inference_method.expectation_propagation(K, ObsAr(Y), lik, None)
v_tilde = mu_tilde * tau_tilde post_params, ga_approx, cav_params, log_Z_tilde = self.model.inference_method.expectation_propagation(K, ObsAr(Y), lik, None)
p, m, d = self.model.inference_method._inference(K, tau_tilde, v_tilde, lik, Y_metadata=None, Z_tilde=log_Z_tilde.sum())
p0, m0, d0 = super(GPy.inference.latent_function_inference.expectation_propagation.EP, inf).inference(k, X,lik ,mu_tilde[:,None], mean_function=None, variance=1./tau_tilde, K=K, Z_tilde=log_Z_tilde.sum() + np.sum(- 0.5*np.log(tau_tilde) + 0.5*(v_tilde*v_tilde*1./tau_tilde))) mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
p, m, d = self.model.inference_method._inference(Y, K, ga_approx, cav_params, lik, Y_metadata=None, Z_tilde=log_Z_tilde)
p0, m0, d0 = super(GPy.inference.latent_function_inference.expectation_propagation.EP, inf).inference(k, X,lik ,mu_tilde[:,None], mean_function=None, variance=1./ga_approx.tau, K=K, Z_tilde=log_Z_tilde + np.sum(- 0.5*np.log(ga_approx.tau) + 0.5*(ga_approx.v*ga_approx.v*1./ga_approx.tau)))
assert (np.sum(np.array([m - m0,
np.sum(d['dL_dK'] - d0['dL_dK']),
np.sum(d['dL_dthetaL'] - d0['dL_dthetaL']),
np.sum(d['dL_dm'] - d0['dL_dm']),
np.sum(p._woodbury_vector - p0._woodbury_vector),
np.sum(p.woodbury_inv - p0.woodbury_inv)])) < 1e6)
# NOTE: adding a test like above for parameterized likelihood- the above test is
# only for probit likelihood which does not have any tunable hyperparameter which is why
# the term in dictionary of gradients: dL_dthetaL will always be zero. So here we repeat tests for
# student-t likelihood and heterodescastic gaussian noise case. This test simply checks if the posterior
# and gradients of log marginal are roughly the same for inference through EP and exact gaussian inference using
# the gaussian approximation for the individual likelihood site terms. For probit likelihood, it is possible to
# calculate moments analytically, but for other likelihoods, we will need to use numerical quadrature techniques,
# and it is possible that any error might creep up because of quadrature implementation.
def test_inference_EP_non_classification(self):
from paramz import ObsAr
X, Y, Y_extra_noisy = self.genNoisyData()
deg_freedom = 5.
init_noise_var = 0.08
lik_studentT = GPy.likelihoods.StudentT(deg_free=deg_freedom, sigma2=init_noise_var)
# like_gaussian_noise = GPy.likelihoods.MixedNoise()
k = GPy.kern.RBF(1, variance=2., lengthscale=1.1)
ep_inf_alt = GPy.inference.latent_function_inference.expectation_propagation.EP(max_iters=4, delta=0.5)
# ep_inf_nested = GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode='nested', max_iters=100, delta=0.5)
m = GPy.core.GP(X=X,Y=Y_extra_noisy,kernel=k,likelihood=lik_studentT,inference_method=ep_inf_alt)
K = m.kern.K(X)
post_params, ga_approx, cav_params, log_Z_tilde = m.inference_method.expectation_propagation(K, ObsAr(Y_extra_noisy), lik_studentT, None)
mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
p, m, d = m.inference_method._inference(Y_extra_noisy, K, ga_approx, cav_params, lik_studentT, Y_metadata=None, Z_tilde=log_Z_tilde)
p0, m0, d0 = super(GPy.inference.latent_function_inference.expectation_propagation.EP, ep_inf_alt).inference(k, X,lik_studentT ,mu_tilde[:,None], mean_function=None, variance=1./ga_approx.tau, K=K, Z_tilde=log_Z_tilde + np.sum(- 0.5*np.log(ga_approx.tau) + 0.5*(ga_approx.v*ga_approx.v*1./ga_approx.tau)))
assert (np.sum(np.array([m - m0, assert (np.sum(np.array([m - m0,
np.sum(d['dL_dK'] - d0['dL_dK']), np.sum(d['dL_dK'] - d0['dL_dK']),

View file

@ -0,0 +1,39 @@
from __future__ import print_function, division
import numpy as np
import GPy
import warnings
from ..util.quad_integrate import quadgk_int, quadvgk
# NOTE(review): np.testing.TestCase was an alias of unittest.TestCase that
# has been removed from modern NumPy — confirm the pinned NumPy version
# still provides it, or switch to unittest.TestCase.
class QuadTests(np.testing.TestCase):
    """
    Tests for the Gauss-Kronrod quadrature implementation.

    Each test integrates a function with a known closed-form integral and
    checks that the quadrature result matches to the stated precision,
    giving a numerical-accuracy check on the implementation.
    """
    def setUp(self):
        # No shared fixtures needed; tests build their own integrands.
        pass
    def test_infinite_quad(self):
        # Moments of exp(-x^2/2) over the real line; the zeroth moment
        # (row 0) is sqrt(2*pi).
        def f(x):
            return np.exp(-0.5*x**2)*np.power(x,np.arange(3)[:,None])
        quad_int_val = quadgk_int(f)
        real_val = np.sqrt(np.pi * 2)
        np.testing.assert_almost_equal(real_val, quad_int_val[0], decimal=7)
    def test_finite_quad(self):
        # Integral of x^2 on [1, 2] is 7/3.
        def f2(x):
            return x**2
        quad_int_val = quadvgk(f2, 1.,2.)
        real_val = 7/3.
        np.testing.assert_almost_equal(real_val, quad_int_val, decimal=5)
if __name__ == '__main__':
    # Standalone smoke check: integrate the first three moments of a
    # standard (unnormalised) Gaussian over the whole real line and
    # compare the zeroth moment against sqrt(2*pi).
    def f(x):
        return np.exp(-0.5 * x ** 2) * np.power(x, np.arange(3)[:, None])
    quad_int_val = quadgk_int(f)
    real_val = np.sqrt(np.pi*2)
    np.testing.assert_almost_equal(real_val, quad_int_val[0], decimal=7)
    print(quadgk_int(f))

View file

@ -17,3 +17,4 @@ from . import multioutput
from . import parallel from . import parallel
from . import functions from . import functions
from . import cluster_with_offset from . import cluster_with_offset
from . import quad_integrate

119
GPy/util/quad_integrate.py Normal file
View file

@ -0,0 +1,119 @@
"""
The file for utilities related to integration by quadrature methods
- will contain implementation for gaussian-kronrod integration.
"""
import numpy as np
def getSubs(Subs, XK, NK=1):
    """Subdivide each integration interval at the quadrature nodes.

    :param Subs: ndarray of shape (2, n); column i holds the (lower, upper)
        bounds of interval i
    :param XK: quadrature nodes on [-1, 1] used as interior split points
    :param NK: row count for the broadcast helper; the default of 1 works
        for any XK because NumPy broadcasting fills in the rest
    :returns: ndarray of shape (2, n*(len(XK)+1)) of refined sub-intervals,
        each original interval split at the mapped node positions
    """
    # Half-width and centre of every interval.
    M = (Subs[1, :] - Subs[0, :]) / 2
    C = (Subs[1, :] + Subs[0, :]) / 2
    # Map the [-1, 1] nodes into each interval: shape (len(XK), n).
    I = XK[:, None] * M + np.ones((NK, 1)) * C
    # Lower bounds: original lower bound followed by the interior nodes;
    # upper bounds: the interior nodes followed by the original upper bound.
    A = np.vstack((Subs[0, :], I)).flatten()
    B = np.vstack((I, Subs[1, :])).flatten()
    return np.vstack((A, B))
def quadvgk(feval, fmin, fmax, tol1=1e-5, tol2=1e-5):
    """Vectorised adaptive Gauss-Kronrod quadrature on [fmin, fmax].

    Port of http://se.mathworks.com/matlabcentral/fileexchange/18801-quadvgk
    (the Gauss-Kronrod scheme also used in GPstuff). Several stacked
    one-dimensional integrands are evaluated simultaneously over a shared
    grid of points; intervals are refined until the Kronrod and embedded
    Gauss estimates agree.

    FIX: tol1/tol2 were previously overwritten with hard-coded 1e-4 inside
    the body, silently ignoring the caller's values; the parameters are
    now honoured.

    :param feval: vectorised integrand mapping x of shape (n,) to values of
        shape (NF, n) (or (n,) for a single integrand)
    :param fmin: finite lower limit
    :param fmax: finite upper limit
    :param tol1: per-interval tolerance on |Q_kronrod - Q_gauss|
    :param tol2: minimum interval width below which refinement stops
    :returns: ndarray of shape (NF,) with the integral of each row of feval
    """
    # 15-point Kronrod nodes on [-1, 1] and their weights.
    XK = np.array([-0.991455371120813, -0.949107912342759, -0.864864423359769, -0.741531185599394,
                   -0.586087235467691, -0.405845151377397, -0.207784955007898, 0.,
                   0.207784955007898, 0.405845151377397, 0.586087235467691,
                   0.741531185599394, 0.864864423359769, 0.949107912342759, 0.991455371120813])
    WK = np.array([0.022935322010529, 0.063092092629979, 0.104790010322250, 0.140653259715525,
                   0.169004726639267, 0.190350578064785, 0.204432940075298, 0.209482141084728,
                   0.204432940075298, 0.190350578064785, 0.169004726639267,
                   0.140653259715525, 0.104790010322250, 0.063092092629979, 0.022935322010529])
    # Embedded 7-point Gauss weights (applied at the odd-indexed Kronrod nodes).
    WG = np.array([0.129484966168870, 0.279705391489277, 0.381830050505119, 0.417959183673469,
                   0.381830050505119, 0.279705391489277, 0.129484966168870])
    NK = WK.size

    def _subdivide(subs):
        # Split every interval at the mapped Kronrod nodes (cf. getSubs).
        half = (subs[1, :] - subs[0, :]) / 2
        centre = (subs[1, :] + subs[0, :]) / 2
        inner = XK[:, None] * half + centre
        lowers = np.vstack((subs[0, :], inner)).flatten()
        uppers = np.vstack((inner, subs[1, :])).flatten()
        return np.vstack((lowers, uppers))

    Subs = np.array([[fmin], [fmax]])
    # Number of stacked integrands, probed with a single dummy point.
    NF = feval(np.zeros(1)).size
    Q = np.zeros(NF)
    while Subs.size > 0:
        Subs = _subdivide(Subs)
        M = (Subs[1, :] - Subs[0, :]) / 2
        C = (Subs[1, :] + Subs[0, :]) / 2
        NM = M.size
        # All evaluation points for all current intervals in one call.
        x = (XK[:, None] * M + C).flatten()
        FV = feval(x)
        # Kronrod (Q1) and embedded Gauss (Q2) estimates per interval;
        # F has shape (NF, NM, NK) after the swap.
        F = FV.reshape(NF, NK, NM).swapaxes(2, 1)
        Q1 = np.dot(F, WK) * M
        Q2 = np.dot(F[:, :, 1::2], WG) * M
        # Accept intervals where the two estimates agree to tol1, or which
        # are already narrower than tol2; keep refining the rest.
        done = np.nonzero(np.logical_or(np.max(np.abs(Q1 - Q2), 0) < tol1,
                                        (Subs[1, :] - Subs[0, :]) < tol2))[0]
        Q = Q + np.sum(Q1[:, done], axis=1)
        Subs = np.delete(Subs, done, axis=1)
    return Q
def quadgk_int(f, fmin=-np.inf, fmax=np.inf, difftol=1e-4):
    """Integrate f over the whole real line with Gauss-Kronrod quadrature.

    The substitution x = r / (1 - r**2) maps r in (-1, 1) onto
    x in (-inf, inf), so the infinite integral becomes a finite one that
    quadvgk can handle; dx/dr = (1 + r**2) / (1 - r**2)**2 is the Jacobian.

    NOTE(review): fmin/fmax are accepted but currently ignored — the
    integral is always taken over (-inf, inf). Confirm no caller relies on
    finite bounds before using them here.

    FIX: difftol was previously overwritten with a hard-coded 1e-4 inside
    the body; the parameter is now honoured and its default is set to the
    old effective value, so default behaviour is unchanged.

    :param f: vectorised integrand mapping x of shape (n,) to (NF, n) values
    :param fmin: unused, see note above
    :param fmax: unused, see note above
    :param difftol: tolerance forwarded to quadvgk
    :returns: ndarray of shape (NF,) with the integral of each row of f
    """
    def trans_func(r):
        r2 = np.square(r)
        x = r / (1 - r2)
        dx_dr = (1 + r2) / (1 - r2) ** 2  # Jacobian of the substitution
        return f(x) * dx_dr

    return quadvgk(trans_func, -1., 1., difftol, difftol)