Mirror of https://github.com/SheffieldML/GPy.git, synced 2026-04-27 05:46:24 +02:00

Fixed quadrature for Bernoulli likelihood; started adding Gaussian likelihood derivatives for quadrature

This commit is contained in:
parent 2de2217473
commit 935f2016db

5 changed files with 23 additions and 35 deletions

@@ -56,6 +56,7 @@ class SVGP(SparseGP):
         self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z) + self.kern.gradients_X(self.grad_dict['dL_dKmn'], self.Z, self.X)

+        self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
         #update the variational parameter gradients:
         self.m.gradient = self.grad_dict['dL_dm']
         self.chol.gradient = self.grad_dict['dL_dchol']
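
Note: the added `update_gradients` call routes the likelihood-hyperparameter gradients (`dL_dthetaL`, computed during inference) into the likelihood object. A minimal sketch of the pattern, assuming a toy likelihood with a single noise-variance hyperparameter (not GPy's actual class):

    import numpy as np

    class ToyLikelihood:
        """Hypothetical stand-in for a likelihood with one hyperparameter."""
        def __init__(self, variance=1.0):
            self.variance = variance
            self.variance_gradient = 0.0

        def update_gradients(self, dL_dthetaL):
            # one entry per hyperparameter; per-datum contributions are summed
            self.variance_gradient = np.sum(dL_dthetaL[0])

    lik = ToyLikelihood()
    lik.update_gradients([np.array([0.1, -0.3, 0.25])])  # e.g. dF_dlik_var per datum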

@@ -5,27 +5,9 @@ import numpy as np
 from posterior import Posterior

 class SVGP(LatentFunctionInference):
-    def likelihood_quadrature(self, Y, m, v):
-        Ysign = np.where(Y==1,1,-1).flatten()
-        from scipy import stats
-        self.gh_x, self.gh_w = np.polynomial.hermite.hermgauss(20)
-
-        #assume probit for now.
-        X = self.gh_x[None,:]*np.sqrt(2.*v[:,None]) + (m*Ysign)[:,None]
-        p = stats.norm.cdf(X)
-        p = np.clip(p, 1e-9, 1.-1e-9) # for numerical stability
-        N = stats.norm.pdf(X)
-        F = np.log(p).dot(self.gh_w)
-        NoverP = N/p
-        dF_dm = (NoverP*Ysign[:,None]).dot(self.gh_w)
-        dF_dv = -0.5*(NoverP**2 + NoverP*X).dot(self.gh_w)
-        return F, dF_dm, dF_dv
-
     def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None):
         assert Y.shape[1]==1, "multi outputs not implemented"

         num_inducing = Z.shape[0]
         #expand cholesky representation
         L = choleskies.flat_to_triang(q_u_chol[:,None]).squeeze()
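
Note: this removes the inference object's private probit quadrature in favour of `likelihood.variational_expectations` (wired in below). For reference, the deleted computation as a self-contained sketch; the 1/sqrt(pi) weight normalisation required for an expectation under N(m, v) is made explicit here, which the deleted method appears to leave out:

    import numpy as np
    from scipy import stats

    def probit_variational_expectations(Y, m, v, degree=20):
        # F_n = E_{N(f|m_n,v_n)}[log Phi(y_n * f)] and its derivatives w.r.t. m and v
        gh_x, gh_w = np.polynomial.hermite.hermgauss(degree)
        gh_w = gh_w / np.sqrt(np.pi)  # raw Gauss-Hermite weights sum to sqrt(pi)
        Ysign = np.where(Y == 1, 1, -1).flatten()
        X = gh_x[None, :] * np.sqrt(2. * v[:, None]) + (m * Ysign)[:, None]
        p = np.clip(stats.norm.cdf(X), 1e-9, 1. - 1e-9)  # numerical stability
        NoverP = stats.norm.pdf(X) / p
        F = np.log(p).dot(gh_w)
        dF_dm = (NoverP * Ysign[:, None]).dot(gh_w)
        dF_dv = -0.5 * (NoverP**2 + NoverP * X).dot(gh_w)
        return F, dF_dm, dF_dv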

@@ -57,9 +39,7 @@ class SVGP(LatentFunctionInference):
         dKL_dKmm = 0.5*Kmmi - 0.5*Kmmi.dot(S).dot(Kmmi) - 0.5*Kmmim[:,None]*Kmmim[None,:]

         #quadrature for the likelihood
-        #F, dF_dmu, dF_dv = likelihood.variational_expectations(Y, mu, v)
-        F, dF_dmu, dF_dv = self.likelihood_quadrature(Y, mu, v)
+        F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v)

         #rescale the F term if working on a batch
         #F, dF_dmu, dF_dv = F*batch_scale, dF_dmu*batch_scale, dF_dv*batch_scale
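
Note: the `dKL_dKmm` context line matches the standard gradient of the Gaussian KL term. With q(u) = N(m, S), p(u) = N(0, Kmm), M inducing points, and (as the variable names suggest) Kmmi = Kmm^{-1}, Kmmim = Kmmi.dot(m):

    \mathrm{KL}(q\,\|\,p) = \tfrac12\left[\log|K_{mm}| - \log|S| - M + \operatorname{tr}(K_{mm}^{-1}S) + m^{\top}K_{mm}^{-1}m\right]

    \frac{\partial\,\mathrm{KL}}{\partial K_{mm}} = \tfrac12 K_{mm}^{-1} - \tfrac12 K_{mm}^{-1} S K_{mm}^{-1} - \tfrac12 \left(K_{mm}^{-1}m\right)\left(K_{mm}^{-1}m\right)^{\top}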

@@ -82,7 +62,7 @@ class SVGP(LatentFunctionInference):
         dL_dchol = 2.*np.dot(dL_dS, L)
         dL_dchol = choleskies.triang_to_flat(dL_dchol[:,:,None]).squeeze()

-        return Posterior(mean=q_u_mean, cov=S, K=Kmm), log_marginal, {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol}
+        return Posterior(mean=q_u_mean, cov=S, K=Kmm), log_marginal, {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
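
Note: the `dL_dchol` context lines apply the chain rule through the Cholesky parameterisation S = L L^T; assuming dL_dS is stored as a symmetric matrix,

    \frac{\partial F}{\partial L} = \left(\frac{\partial F}{\partial S} + \frac{\partial F}{\partial S}^{\top}\right) L = 2\,\frac{\partial F}{\partial S}\,L

which is then flattened back to the vectorised triangular storage.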

@@ -133,7 +133,7 @@ class Bernoulli(Likelihood):
         """
         #objective = y*np.log(inv_link_f) + (1.-y)*np.log(inv_link_f)
         p = np.where(y==1, inv_link_f, 1.-inv_link_f)
-        return np.log(np.clip(p, 1e-6 ,np.inf))
+        return np.log(np.clip(p, 1e-9 ,np.inf))

     def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
         """
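
Note: tightening the clip floor from 1e-6 to 1e-9 still keeps the log finite, but distorts very confident wrong predictions less. An illustrative check (not part of the commit):

    import numpy as np

    p = np.array([0.0, 1e-12, 0.5])  # probability assigned to the observed class
    print(np.log(np.clip(p, 1e-6, np.inf)))  # floored at log(1e-6) ~ -13.8
    print(np.log(np.clip(p, 1e-9, np.inf)))  # floored at log(1e-9) ~ -20.7, nearer the true log p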

@@ -152,7 +152,7 @@ class Bernoulli(Likelihood):
         """
         #grad = (y/inv_link_f) - (1.-y)/(1-inv_link_f)
         #grad = np.where(y, 1./inv_link_f, -1./(1-inv_link_f))
-        ff = np.clip(inv_link_f, 1e-6, 1-1e-6)
+        ff = np.clip(inv_link_f, 1e-9, 1-1e-9)
         denom = np.where(y, ff, -(1-ff))
         return 1./denom

@@ -180,7 +180,7 @@ class Bernoulli(Likelihood):
         #d2logpdf_dlink2 = -y/(inv_link_f**2) - (1-y)/((1-inv_link_f)**2)
         #d2logpdf_dlink2 = np.where(y, -1./np.square(inv_link_f), -1./np.square(1.-inv_link_f))
         arg = np.where(y, inv_link_f, 1.-inv_link_f)
-        ret = -1./np.square(np.clip(arg, 1e-3, np.inf))
+        ret = -1./np.square(np.clip(arg, 1e-9, 1e9))
         if np.any(np.isinf(ret)):
             stop
         return ret
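
Note: `stop` above is an undefined name, used as a deliberate NameError breakpoint if infinities slip through. A quick finite-difference sanity check of the two Bernoulli link derivatives, as a standalone sketch using the same formulas:

    import numpy as np

    def logpdf(f, y=1):
        # Bernoulli log-density in link space, as in the hunks above
        p = np.where(y == 1, f, 1. - f)
        return np.log(np.clip(p, 1e-9, np.inf))

    f, eps = 0.3, 1e-6
    grad = 1. / np.clip(f, 1e-9, 1 - 1e-9)         # dlogpdf_dlink at y=1
    hess = -1. / np.square(np.clip(f, 1e-9, 1e9))  # d2logpdf_dlink2 at y=1
    fd = (logpdf(f + eps) - logpdf(f - eps)) / (2 * eps)
    print(np.allclose(grad, fd))           # True (both ~3.3333)
    print(np.isclose(hess, -1. / f**2))    # True (~-11.111)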

@@ -316,3 +316,11 @@ class Gaussian(Likelihood):
         v = var_star + self.variance
         return -0.5*np.log(2*np.pi) -0.5*np.log(v) - 0.5*np.square(y_test - mu_star)/v
+
+    def variational_expectations(self, Y, m, v, gh_points=None):
+        lik_var = float(self.variance)
+        F = -0.5*np.log(2*np.pi) - 0.5*np.log(lik_var) - 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/lik_var
+        dF_dmu = (Y - m)/lik_var
+        dF_dv = -0.5/lik_var
+        dF_dlik_var = -0.5/lik_var + 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/(lik_var**2)
+        dF_dtheta = [dF_dlik_var]
+        return F, dF_dmu, dF_dv, dF_dtheta
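
Note: for a Gaussian likelihood the variational expectation is available in closed form, since E[(Y - f)^2] = (Y - m)^2 + v for f ~ N(m, v), so no quadrature is needed. A quick cross-check against Gauss-Hermite (a sketch, not part of the commit):

    import numpy as np

    Y, m, v, lik_var = 0.7, 0.2, 0.5, 1.3

    # closed form, as in the new method above
    F = -0.5*np.log(2*np.pi*lik_var) - 0.5*((Y - m)**2 + v)/lik_var

    # 20-point Gauss-Hermite estimate of E_{N(f|m,v)}[log N(Y|f, lik_var)]
    gh_x, gh_w = np.polynomial.hermite.hermgauss(20)
    f = np.sqrt(2*v)*gh_x + m
    logp = -0.5*np.log(2*np.pi*lik_var) - 0.5*(Y - f)**2/lik_var
    print(np.isclose(F, logp.dot(gh_w)/np.sqrt(np.pi)))  # True: exact for a quadratic integrand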

@@ -133,7 +133,7 @@ class Likelihood(Parameterized):

     def variational_expectations(self, Y, m, v, gh_points=None):
         """
         Use Gauss-Hermite Quadrature to compute

            E_p(f) [ log p(y|f) ]
            d/dm E_p(f) [ log p(y|f) ]
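
Note: with p(f) = N(f | m, v) and the substitution f = sqrt(2v) x + m, the expectations in this docstring reduce to the standard Gauss-Hermite rule with nodes x_i and weights w_i:

    \mathbb{E}_{\mathcal{N}(f\mid m,v)}\left[\log p(y\mid f)\right] = \frac{1}{\sqrt{\pi}}\int e^{-x^{2}}\,\log p\left(y\mid \sqrt{2v}\,x+m\right)dx \approx \frac{1}{\sqrt{\pi}}\sum_i w_i\,\log p\left(y\mid \sqrt{2v}\,x_i+m\right)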

@@ -143,9 +143,10 @@
         if no gh_points are passed, we construct them using default options
         """
+        #May be broken

         if gh_points is None:
-            gh_x, gh_w = np.polynomial.hermite.hermgauss(12)
+            gh_x, gh_w = np.polynomial.hermite.hermgauss(20)
         else:
             gh_x, gh_w = gh_points

@@ -156,15 +157,15 @@
         X = gh_x[None,:]*np.sqrt(2.*v[:,None]) + m[:,None]

         #evaluate the likelihood on the grid. The first axis indexes the data (and mu, var), the second indexes the grid.
         #broadcasting needs to be handled carefully.
         logp = self.logpdf(X,Y[:,None])
         dlogp_dx = self.dlogpdf_df(X, Y[:,None])
         d2logp_dx2 = self.d2logpdf_df2(X, Y[:,None])

         #clipping for numerical stability
-        logp = np.clip(logp,-1e6,1e6)
-        dlogp_dx = np.clip(dlogp_dx,-1e6,1e6)
-        d2logp_dx2 = np.clip(d2logp_dx2,-1e6,1e6)
+        #logp = np.clip(logp,-1e9,1e9)
+        #dlogp_dx = np.clip(dlogp_dx,-1e9,1e9)
+        #d2logp_dx2 = np.clip(d2logp_dx2,-1e9,1e9)

         #average over the grid to get derivatives of the Gaussian's parameters
         F = np.dot(logp, gh_w)
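
Note: the first and second f-derivatives of logp on the grid are exactly what the gradients of a Gaussian expectation need, via the standard identities (often attributed to Bonnet and Price):

    \frac{\partial}{\partial m}\,\mathbb{E}[\log p(y\mid f)] = \mathbb{E}\left[\frac{\partial \log p(y\mid f)}{\partial f}\right], \qquad \frac{\partial}{\partial v}\,\mathbb{E}[\log p(y\mid f)] = \tfrac12\,\mathbb{E}\left[\frac{\partial^{2} \log p(y\mid f)}{\partial f^{2}}\right]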

@@ -176,10 +177,8 @@
         if np.any(np.isnan(dF_dm)) or np.any(np.isinf(dF_dm)):
             stop

-        return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape)
+        dF_dtheta = None # Not yet implemented
+        return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape), dF_dtheta

     def predictive_mean(self, mu, variance, Y_metadata=None):
         """