mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-05-27 14:25:16 +02:00
Finished tearing gaussian noise down, time for student t
This commit is contained in:
parent
77bca55470
commit
76debef6b8
5 changed files with 208 additions and 191 deletions
|
|
@ -76,7 +76,7 @@ class Laplace(likelihood):
|
||||||
return self.noise_model._set_params(p)
|
return self.noise_model._set_params(p)
|
||||||
|
|
||||||
def _shared_gradients_components(self):
|
def _shared_gradients_components(self):
|
||||||
d3lik_d3fhat = self.noise_model.d3lik_d3f(self.data, self.f_hat, extra_data=self.extra_data)
|
d3lik_d3fhat = -self.noise_model._d3nlog_mass_dgp3(self.f_hat, self.data, extra_data=self.extra_data)
|
||||||
dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
|
dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
|
||||||
I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
|
I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
|
||||||
return dL_dfhat, I_KW_i
|
return dL_dfhat, I_KW_i
|
||||||
|
|
@ -89,7 +89,7 @@ class Laplace(likelihood):
|
||||||
:rtype: Matrix (1 x num_kernel_params)
|
:rtype: Matrix (1 x num_kernel_params)
|
||||||
"""
|
"""
|
||||||
dL_dfhat, I_KW_i = self._shared_gradients_components()
|
dL_dfhat, I_KW_i = self._shared_gradients_components()
|
||||||
dlp = self.noise_model.dlik_df(self.data, self.f_hat)
|
dlp = -self.noise_model._dnlog_mass_dgp(self.data, self.f_hat)
|
||||||
|
|
||||||
#Explicit
|
#Explicit
|
||||||
#expl_a = np.dot(self.Ki_f, self.Ki_f.T)
|
#expl_a = np.dot(self.Ki_f, self.Ki_f.T)
|
||||||
|
|
@ -178,7 +178,7 @@ class Laplace(likelihood):
|
||||||
|
|
||||||
self.Wi_K_i = self.W12BiW12
|
self.Wi_K_i = self.W12BiW12
|
||||||
self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
|
self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
|
||||||
self.lik = self.noise_model.lik_function(self.data, self.f_hat, extra_data=self.extra_data)
|
self.lik = -self.noise_model._nlog_mass(self.f_hat, self.data, extra_data=self.extra_data)
|
||||||
self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
|
self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
|
||||||
|
|
||||||
Z_tilde = (+ self.lik
|
Z_tilde = (+ self.lik
|
||||||
|
|
@ -237,7 +237,7 @@ class Laplace(likelihood):
|
||||||
Rasmussen suggests the use of a numerically stable positive definite matrix B
|
Rasmussen suggests the use of a numerically stable positive definite matrix B
|
||||||
Which has a positive diagonal element and can be easyily inverted
|
Which has a positive diagonal element and can be easyily inverted
|
||||||
|
|
||||||
:param K: Prior covariance matrix evaluated at locations X
|
:param K: Prior Covariance matrix evaluated at locations X
|
||||||
:type K: NxN matrix
|
:type K: NxN matrix
|
||||||
:param W: Negative hessian at a point (diagonal matrix)
|
:param W: Negative hessian at a point (diagonal matrix)
|
||||||
:type W: Vector of diagonal values of hessian (1xN)
|
:type W: Vector of diagonal values of hessian (1xN)
|
||||||
|
|
@ -290,7 +290,7 @@ class Laplace(likelihood):
|
||||||
old_obj = np.inf
|
old_obj = np.inf
|
||||||
|
|
||||||
def obj(Ki_f, f):
|
def obj(Ki_f, f):
|
||||||
return -0.5*np.dot(Ki_f.T, f) + self.noise_model.lik_function(self.data, f, extra_data=self.extra_data)
|
return -0.5*np.dot(Ki_f.T, f) - self.noise_model._nlog_mass(f, self.data, extra_data=self.extra_data)
|
||||||
|
|
||||||
difference = np.inf
|
difference = np.inf
|
||||||
epsilon = 1e-6
|
epsilon = 1e-6
|
||||||
|
|
@ -302,7 +302,7 @@ class Laplace(likelihood):
|
||||||
W = -self.noise_model.d2lik_d2f(self.data, f, extra_data=self.extra_data)
|
W = -self.noise_model.d2lik_d2f(self.data, f, extra_data=self.extra_data)
|
||||||
|
|
||||||
W_f = W*f
|
W_f = W*f
|
||||||
grad = self.noise_model.dlik_df(self.data, f, extra_data=self.extra_data)
|
grad = -self.noise_model._dnlog_mass_dgp(f, self.data, extra_data=self.extra_data)
|
||||||
|
|
||||||
b = W_f + grad
|
b = W_f + grad
|
||||||
W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b))
|
W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b))
|
||||||
|
|
|
||||||
|
|
@ -38,9 +38,9 @@ class Gaussian(NoiseDistribution):
|
||||||
|
|
||||||
def _laplace_gradients(self, y, f, extra_data=None):
|
def _laplace_gradients(self, y, f, extra_data=None):
|
||||||
#must be listed in same order as 'get_param_names'
|
#must be listed in same order as 'get_param_names'
|
||||||
derivs = ([self.dlik_dvar(y, f, extra_data=extra_data)],
|
derivs = ([-self._dnlog_mass_dvar(f, y, extra_data=extra_data)],
|
||||||
[self.dlik_df_dvar(y, f, extra_data=extra_data)],
|
[-self._dnlog_mass_dgp_dvar(f, y, extra_data=extra_data)],
|
||||||
[self.d2lik_d2f_dvar(y, f, extra_data=extra_data)]
|
[-self._d2nlog_mass_dgp2_dvar(f, y, extra_data=extra_data)]
|
||||||
) # lists as we might learn many parameters
|
) # lists as we might learn many parameters
|
||||||
# ensure we have gradients for every parameter we want to optimize
|
# ensure we have gradients for every parameter we want to optimize
|
||||||
assert len(derivs[0]) == len(self._get_param_names())
|
assert len(derivs[0]) == len(self._get_param_names())
|
||||||
|
|
@ -80,22 +80,23 @@ class Gaussian(NoiseDistribution):
|
||||||
def _predictive_variance_analytical(self,mu,sigma,predictive_mean=None):
|
def _predictive_variance_analytical(self,mu,sigma,predictive_mean=None):
|
||||||
return 1./(1./self.variance + 1./sigma**2)
|
return 1./(1./self.variance + 1./sigma**2)
|
||||||
|
|
||||||
def _mass(self,gp,obs):
|
def _mass(self, gp, obs):
|
||||||
#return std_norm_pdf( (self.gp_link.transf(gp)-obs)/np.sqrt(self.variance) )
|
#return std_norm_pdf( (self.gp_link.transf(gp)-obs)/np.sqrt(self.variance) )
|
||||||
#Assumes no covariance, exp, sum, log for numerical stability
|
#Assumes no covariance, exp, sum, log for numerical stability
|
||||||
return np.exp(np.sum(np.log(stats.norm.pdf(obs,self.gp_link.transf(gp),np.sqrt(self.variance)))))
|
return np.exp(np.sum(np.log(stats.norm.pdf(obs,self.gp_link.transf(gp),np.sqrt(self.variance)))))
|
||||||
|
|
||||||
def _nlog_mass(self,gp,obs, extra_data=None):
|
def _nlog_mass(self, gp, obs, extra_data=None):
|
||||||
"""
|
"""
|
||||||
Negative Log likelihood function
|
Negative Log likelihood function
|
||||||
|
Chained with link function deriative
|
||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
\\-ln p(y_{i}|f_{i}) = +\\frac{D \\ln 2\\pi}{2} + \\frac{\\ln |K|}{2} + \\frac{(y_{i} - f_{i})^{T}\\sigma^{-2}(y_{i} - f_{i})}{2}
|
\\-ln p(y_{i}|\\lambda(f_{i})) = +\\frac{D \\ln 2\\pi}{2} + \\frac{\\ln |K|}{2} + \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
|
||||||
|
|
||||||
:param y: data
|
:param gp: latent variables (f)
|
||||||
:type y: Nx1 array
|
:type gp: Nx1 array
|
||||||
:param f: latent variables f
|
:param obs: data (y)
|
||||||
:type f: Nx1 array
|
:type obs: Nx1 array
|
||||||
:param extra_data: extra_data which is not used in student t distribution - not used
|
:param extra_data: extra_data which is not used in student t distribution - not used
|
||||||
:returns: likelihood evaluated for this point
|
:returns: likelihood evaluated for this point
|
||||||
:rtype: float
|
:rtype: float
|
||||||
|
|
@ -103,12 +104,133 @@ class Gaussian(NoiseDistribution):
|
||||||
assert gp.shape == obs.shape
|
assert gp.shape == obs.shape
|
||||||
return .5*(np.sum((self.gp_link.transf(gp)-obs)**2/self.variance) + self.ln_det_K + self.N*np.log(2.*np.pi))
|
return .5*(np.sum((self.gp_link.transf(gp)-obs)**2/self.variance) + self.ln_det_K + self.N*np.log(2.*np.pi))
|
||||||
|
|
||||||
def _dnlog_mass_dgp(self,gp,obs):
|
def _dnlog_mass_dgp(self, gp, obs, extra_data=None):
|
||||||
|
"""
|
||||||
|
Negative Gradient of the link function at y, given f w.r.t f
|
||||||
|
Chained with link function deriative
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
\\frac{d \\ln p(y_{i}|f_{i})}{df} = \\frac{1}{\\sigma^{2}}(y_{i} - f_{i})
|
||||||
|
\\frac{d \\-ln p(y_{i}|f_{i})}{df} = -\\frac{1}{\\sigma^{2}}(y_{i} - \\lambda(f_{i}))\\frac{d\\lambda(f_{i})}{df_{i}}
|
||||||
|
|
||||||
|
:param gp: latent variables (f)
|
||||||
|
:type gp: Nx1 array
|
||||||
|
:param obs: data (y)
|
||||||
|
:type obs: Nx1 array
|
||||||
|
:param extra_data: extra_data which is not used in student t distribution - not used
|
||||||
|
:returns: gradient of negative likelihood evaluated at points
|
||||||
|
:rtype: Nx1 array
|
||||||
|
"""
|
||||||
|
assert gp.shape == obs.shape
|
||||||
return (self.gp_link.transf(gp)-obs)/self.variance * self.gp_link.dtransf_df(gp)
|
return (self.gp_link.transf(gp)-obs)/self.variance * self.gp_link.dtransf_df(gp)
|
||||||
|
|
||||||
def _d2nlog_mass_dgp2(self,gp,obs):
|
def _d2nlog_mass_dgp2(self, gp, obs, extra_data=None):
|
||||||
|
"""
|
||||||
|
Negative Hessian at y, given f, w.r.t f the hessian will be 0 unless i == j
|
||||||
|
i.e. second derivative _nlog_mass at y given f_{i} f_{j} w.r.t f_{i} and f_{j}
|
||||||
|
Chained with link function deriative
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
\\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f} = -\\frac{1}{\\sigma^{2}}
|
||||||
|
|
||||||
|
:param gp: latent variables (f)
|
||||||
|
:type gp: Nx1 array
|
||||||
|
:param obs: data (y)
|
||||||
|
:type obs: Nx1 array
|
||||||
|
:param extra_data: extra_data which is not used in student t distribution - not used
|
||||||
|
:returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
|
||||||
|
:rtype: Nx1 array
|
||||||
|
|
||||||
|
.. Note::
|
||||||
|
Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
|
||||||
|
(the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
|
||||||
|
"""
|
||||||
|
assert gp.shape == obs.shape
|
||||||
|
#FIXME: Why squared?
|
||||||
return ((self.gp_link.transf(gp)-obs)*self.gp_link.d2transf_df2(gp) + self.gp_link.dtransf_df(gp)**2)/self.variance
|
return ((self.gp_link.transf(gp)-obs)*self.gp_link.d2transf_df2(gp) + self.gp_link.dtransf_df(gp)**2)/self.variance
|
||||||
|
|
||||||
|
def _d3nlog_mass_dgp3(self, gp, obs, extra_data=None):
|
||||||
|
"""
|
||||||
|
Third order derivative log-likelihood function at y given f w.r.t f
|
||||||
|
Chained with link function deriative
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
\\frac{d^{3} \\ln p(y_{i}|f_{i})}{d^{3}f} = 0
|
||||||
|
|
||||||
|
:param gp: latent variables (f)
|
||||||
|
:type gp: Nx1 array
|
||||||
|
:param obs: data (y)
|
||||||
|
:type obs: Nx1 array
|
||||||
|
:param extra_data: extra_data which is not used in student t distribution - not used
|
||||||
|
:returns: third derivative of likelihood evaluated at points f
|
||||||
|
:rtype: Nx1 array
|
||||||
|
"""
|
||||||
|
assert gp.shape == obs.shape
|
||||||
|
d2lambda_df2 = self.gp_link.d2transf_df2(gp)
|
||||||
|
return ((self.gp_link.transf(gp)-obs)*self.gp_link.d3transf_df3(gp) - self.gp_link.dtransf_df(gp)*d2lambda_df2 + d2lambda_df2)/self.variance
|
||||||
|
|
||||||
|
def _dnlog_mass_dvar(self, gp, obs, extra_data=None):
|
||||||
|
"""
|
||||||
|
Gradient of the negative log-likelihood function at y given f, w.r.t variance parameter (noise_variance)
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
\\frac{d \\ln p(y_{i}|f_{i})}{d\\sigma^{2}} = \\frac{N}{2\\sigma^{2}} + \\frac{(y_{i} - f_{i})^{2}}{2\\sigma^{4}}
|
||||||
|
|
||||||
|
:param gp: latent variables (f)
|
||||||
|
:type gp: Nx1 array
|
||||||
|
:param obs: data (y)
|
||||||
|
:type obs: Nx1 array
|
||||||
|
:param extra_data: extra_data which is not used in student t distribution - not used
|
||||||
|
:returns: derivative of likelihood evaluated at points f w.r.t variance parameter
|
||||||
|
:rtype: float
|
||||||
|
"""
|
||||||
|
assert gp.shape == obs.shape
|
||||||
|
e = (obs - self.gp_link.transf(gp))
|
||||||
|
s_4 = 1.0/(self.variance**2)
|
||||||
|
dnlik_dsigma = 0.5*self.N/self.variance - 0.5*s_4*np.dot(e.T, e)
|
||||||
|
return np.sum(dnlik_dsigma) # Sure about this sum?
|
||||||
|
|
||||||
|
def _dnlog_mass_dgp_dvar(self, gp, obs, extra_data=None):
|
||||||
|
"""
|
||||||
|
Derivative of the dlik_df w.r.t variance parameter (noise_variance)
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
\\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|f_{i})}{df}) = \\frac{1}{\\sigma^{4}}(-y_{i} + f_{i})
|
||||||
|
|
||||||
|
:param y: data
|
||||||
|
:type y: Nx1 array
|
||||||
|
:param f: latent variables f
|
||||||
|
:type f: Nx1 array
|
||||||
|
:param extra_data: extra_data which is not used in student t distribution - not used
|
||||||
|
:returns: derivative of likelihood evaluated at points f w.r.t variance parameter
|
||||||
|
:rtype: Nx1 array
|
||||||
|
"""
|
||||||
|
assert gp.shape == obs.shape
|
||||||
|
s_4 = 1.0/(self.variance**2)
|
||||||
|
dnlik_grad_dsigma = s_4*(obs - self.gp_link.transf(gp))*self.gp_link.dtransf_df(gp)
|
||||||
|
return dnlik_grad_dsigma
|
||||||
|
|
||||||
|
def _d2nlog_mass_dgp2_dvar(self, gp, obs, extra_data=None):
|
||||||
|
"""
|
||||||
|
Gradient of the hessian (d2lik_d2f) w.r.t variance parameter (noise_variance)
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
\\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f}) = \\frac{1}{\\sigma^{4}}
|
||||||
|
|
||||||
|
:param gp: latent variables (f)
|
||||||
|
:type gp: Nx1 array
|
||||||
|
:param obs: data (y)
|
||||||
|
:type obs: Nx1 array
|
||||||
|
:param extra_data: extra_data which is not used in student t distribution - not used
|
||||||
|
:returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
|
||||||
|
:rtype: Nx1 array
|
||||||
|
"""
|
||||||
|
assert gp.shape == obs.shape
|
||||||
|
s_4 = 1.0/(self.variance**2)
|
||||||
|
#FIXME: Why squared?
|
||||||
|
dnlik_hess_dvar = -s_4*((self.gp_link.transf(gp)-obs)*self.gp_link.d2transf_df2(gp) + self.gp_link.dtransf_df(gp)**2)
|
||||||
|
return dnlik_hess_dvar
|
||||||
|
|
||||||
def _mean(self,gp):
|
def _mean(self,gp):
|
||||||
"""
|
"""
|
||||||
Expected value of y under the Mass (or density) function p(y|f)
|
Expected value of y under the Mass (or density) function p(y|f)
|
||||||
|
|
@ -138,150 +260,3 @@ class Gaussian(NoiseDistribution):
|
||||||
|
|
||||||
def _d2variance_dgp2(self,gp):
|
def _d2variance_dgp2(self,gp):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def lik_function(self, y, f, extra_data=None):
|
|
||||||
"""
|
|
||||||
Log likelihood function
|
|
||||||
|
|
||||||
.. math::
|
|
||||||
\\ln p(y_{i}|f_{i}) = -\\frac{D \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - f_{i})^{T}\\sigma^{-2}(y_{i} - f_{i})}{2}
|
|
||||||
|
|
||||||
:param y: data
|
|
||||||
:type y: Nx1 array
|
|
||||||
:param f: latent variables f
|
|
||||||
:type f: Nx1 array
|
|
||||||
:param extra_data: extra_data which is not used in student t distribution - not used
|
|
||||||
:returns: likelihood evaluated for this point
|
|
||||||
:rtype: float
|
|
||||||
"""
|
|
||||||
assert y.shape == f.shape
|
|
||||||
e = y - f
|
|
||||||
objective = (- 0.5*self.N*np.log(2*np.pi)
|
|
||||||
- 0.5*self.ln_det_K
|
|
||||||
- (0.5/self.variance)*np.sum(np.square(e)) # As long as K is diagonal
|
|
||||||
)
|
|
||||||
return np.sum(objective)
|
|
||||||
|
|
||||||
def dlik_df(self, y, f, extra_data=None):
|
|
||||||
"""
|
|
||||||
Gradient of the link function at y, given f w.r.t f
|
|
||||||
|
|
||||||
.. math::
|
|
||||||
\\frac{d \\ln p(y_{i}|f_{i})}{df} = \\frac{1}{\\sigma^{2}}(y_{i} - f_{i})
|
|
||||||
|
|
||||||
:param y: data
|
|
||||||
:type y: Nx1 array
|
|
||||||
:param f: latent variables f
|
|
||||||
:type f: Nx1 array
|
|
||||||
:param extra_data: extra_data which is not used in student t distribution - not used
|
|
||||||
:returns: gradient of likelihood evaluated at points
|
|
||||||
:rtype: Nx1 array
|
|
||||||
|
|
||||||
"""
|
|
||||||
assert y.shape == f.shape
|
|
||||||
s2_i = (1.0/self.variance)
|
|
||||||
grad = s2_i*y - s2_i*f
|
|
||||||
return grad
|
|
||||||
|
|
||||||
def d2lik_d2f(self, y, f, extra_data=None):
|
|
||||||
"""
|
|
||||||
Hessian at y, given f, w.r.t f the hessian will be 0 unless i == j
|
|
||||||
i.e. second derivative lik_function at y given f_{i} f_{j} w.r.t f_{i} and f_{j}
|
|
||||||
|
|
||||||
.. math::
|
|
||||||
\\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f} = -\\frac{1}{\\sigma^{2}}
|
|
||||||
|
|
||||||
:param y: data
|
|
||||||
:type y: Nx1 array
|
|
||||||
:param f: latent variables f
|
|
||||||
:type f: Nx1 array
|
|
||||||
:param extra_data: extra_data which is not used in student t distribution - not used
|
|
||||||
:returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
|
|
||||||
:rtype: Nx1 array
|
|
||||||
|
|
||||||
.. Note::
|
|
||||||
Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
|
|
||||||
(the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
|
|
||||||
"""
|
|
||||||
assert y.shape == f.shape
|
|
||||||
hess = -(1.0/self.variance)*np.ones((self.N, 1))
|
|
||||||
return hess
|
|
||||||
|
|
||||||
def d3lik_d3f(self, y, f, extra_data=None):
|
|
||||||
"""
|
|
||||||
Third order derivative log-likelihood function at y given f w.r.t f
|
|
||||||
|
|
||||||
.. math::
|
|
||||||
\\frac{d^{3} \\ln p(y_{i}|f_{i})}{d^{3}f} = 0
|
|
||||||
|
|
||||||
:param y: data
|
|
||||||
:type y: Nx1 array
|
|
||||||
:param f: latent variables f
|
|
||||||
:type f: Nx1 array
|
|
||||||
:param extra_data: extra_data which is not used in student t distribution - not used
|
|
||||||
:returns: third derivative of likelihood evaluated at points f
|
|
||||||
:rtype: Nx1 array
|
|
||||||
"""
|
|
||||||
assert y.shape == f.shape
|
|
||||||
d3lik_d3f = np.diagonal(0*self.I)[:, None]
|
|
||||||
return d3lik_d3f
|
|
||||||
|
|
||||||
def dlik_dvar(self, y, f, extra_data=None):
|
|
||||||
"""
|
|
||||||
Gradient of the log-likelihood function at y given f, w.r.t variance parameter (noise_variance)
|
|
||||||
|
|
||||||
.. math::
|
|
||||||
\\frac{d \\ln p(y_{i}|f_{i})}{d\\sigma^{2}} = \\frac{N}{2\\sigma^{2}} + \\frac{(y_{i} - f_{i})^{2}}{2\\sigma^{4}}
|
|
||||||
|
|
||||||
:param y: data
|
|
||||||
:type y: Nx1 array
|
|
||||||
:param f: latent variables f
|
|
||||||
:type f: Nx1 array
|
|
||||||
:param extra_data: extra_data which is not used in student t distribution - not used
|
|
||||||
:returns: derivative of likelihood evaluated at points f w.r.t variance parameter
|
|
||||||
:rtype: float
|
|
||||||
"""
|
|
||||||
assert y.shape == f.shape
|
|
||||||
e = y - f
|
|
||||||
s_4 = 1.0/(self.variance**2)
|
|
||||||
dlik_dsigma = -0.5*self.N/self.variance + 0.5*s_4*np.dot(e.T, e)
|
|
||||||
return np.sum(dlik_dsigma) # Sure about this sum?
|
|
||||||
|
|
||||||
def dlik_df_dvar(self, y, f, extra_data=None):
|
|
||||||
"""
|
|
||||||
Derivative of the dlik_df w.r.t variance parameter (noise_variance)
|
|
||||||
|
|
||||||
.. math::
|
|
||||||
\\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|f_{i})}{df}) = \\frac{1}{\\sigma^{4}}(-y_{i} + f_{i})
|
|
||||||
|
|
||||||
:param y: data
|
|
||||||
:type y: Nx1 array
|
|
||||||
:param f: latent variables f
|
|
||||||
:type f: Nx1 array
|
|
||||||
:param extra_data: extra_data which is not used in student t distribution - not used
|
|
||||||
:returns: derivative of likelihood evaluated at points f w.r.t variance parameter
|
|
||||||
:rtype: Nx1 array
|
|
||||||
"""
|
|
||||||
assert y.shape == f.shape
|
|
||||||
s_4 = 1.0/(self.variance**2)
|
|
||||||
dlik_grad_dsigma = -np.dot(s_4*self.I, y) + np.dot(s_4*self.I, f)
|
|
||||||
return dlik_grad_dsigma
|
|
||||||
|
|
||||||
def d2lik_d2f_dvar(self, y, f, extra_data=None):
|
|
||||||
"""
|
|
||||||
Gradient of the hessian (d2lik_d2f) w.r.t variance parameter (noise_variance)
|
|
||||||
|
|
||||||
.. math::
|
|
||||||
\\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f}) = \\frac{1}{\\sigma^{4}}
|
|
||||||
|
|
||||||
:param y: data
|
|
||||||
:type y: Nx1 array
|
|
||||||
:param f: latent variables f
|
|
||||||
:type f: Nx1 array
|
|
||||||
:param extra_data: extra_data which is not used in student t distribution - not used
|
|
||||||
:returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
|
|
||||||
:rtype: Nx1 array
|
|
||||||
"""
|
|
||||||
assert y.shape == f.shape
|
|
||||||
dlik_hess_dsigma = np.diag((1.0/(self.variance**2))*self.I)[:, None]
|
|
||||||
return dlik_hess_dsigma
|
|
||||||
|
|
|
||||||
|
|
@ -24,19 +24,25 @@ class GPTransformation(object):
|
||||||
"""
|
"""
|
||||||
Gaussian process tranformation function, latent space -> output space
|
Gaussian process tranformation function, latent space -> output space
|
||||||
"""
|
"""
|
||||||
pass
|
raise NotImplementedError
|
||||||
|
|
||||||
def dtransf_df(self,f):
|
def dtransf_df(self,f):
|
||||||
"""
|
"""
|
||||||
derivative of transf(f) w.r.t. f
|
derivative of transf(f) w.r.t. f
|
||||||
"""
|
"""
|
||||||
pass
|
raise NotImplementedError
|
||||||
|
|
||||||
def d2transf_df2(self,f):
|
def d2transf_df2(self,f):
|
||||||
"""
|
"""
|
||||||
second derivative of transf(f) w.r.t. f
|
second derivative of transf(f) w.r.t. f
|
||||||
"""
|
"""
|
||||||
pass
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def d3transf_df3(self,f):
|
||||||
|
"""
|
||||||
|
third derivative of transf(f) w.r.t. f
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
class Identity(GPTransformation):
|
class Identity(GPTransformation):
|
||||||
"""
|
"""
|
||||||
|
|
@ -54,6 +60,9 @@ class Identity(GPTransformation):
|
||||||
def d2transf_df2(self,f):
|
def d2transf_df2(self,f):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
def d3transf_df3(self,f):
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
class Probit(GPTransformation):
|
class Probit(GPTransformation):
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
|
|
@ -40,30 +40,30 @@ class StudentT(NoiseDistribution):
|
||||||
def variance(self, extra_data=None):
|
def variance(self, extra_data=None):
|
||||||
return (self.v / float(self.v - 2)) * self.sigma2
|
return (self.v / float(self.v - 2)) * self.sigma2
|
||||||
|
|
||||||
def lik_function(self, y, f, extra_data=None):
|
def _nlog_mass(self, gp, obs, extra_data=None):
|
||||||
"""
|
"""
|
||||||
Log Likelihood Function
|
Log Likelihood Function
|
||||||
|
|
||||||
.. math::
|
.. math::
|
||||||
\\ln p(y_{i}|f_{i}) = \\ln \\Gamma(\\frac{v+1}{2}) - \\ln \\Gamma(\\frac{v}{2})\\sqrt{v \\pi}\sigma - \\frac{v+1}{2}\\ln (1 + \\frac{1}{v}\\left(\\frac{y_{i} - f_{i}}{\\sigma}\\right)^2
|
\\ln p(y_{i}|f_{i}) = \\ln \\Gamma(\\frac{v+1}{2}) - \\ln \\Gamma(\\frac{v}{2})\\sqrt{v \\pi}\sigma - \\frac{v+1}{2}\\ln (1 + \\frac{1}{v}\\left(\\frac{y_{i} - f_{i}}{\\sigma}\\right)^2
|
||||||
|
|
||||||
:param y: data
|
:param gp: latent variables (f)
|
||||||
:type y: Nx1 array
|
:type gp: Nx1 array
|
||||||
:param f: latent variables f
|
:param obs: data (y)
|
||||||
:type f: Nx1 array
|
:type obs: Nx1 array
|
||||||
:param extra_data: extra_data which is not used in student t distribution - not used
|
:param extra_data: extra_data which is not used in student t distribution - not used
|
||||||
:returns: likelihood evaluated for this point
|
:returns: likelihood evaluated for this point
|
||||||
:rtype: float
|
:rtype: float
|
||||||
|
|
||||||
"""
|
"""
|
||||||
assert y.shape == f.shape
|
assert gp.shape == obs.shape
|
||||||
e = y - f
|
e = obs - self.gp_link.transf(gp)
|
||||||
objective = (+ gammaln((self.v + 1) * 0.5)
|
objective = (+ gammaln((self.v + 1) * 0.5)
|
||||||
- gammaln(self.v * 0.5)
|
- gammaln(self.v * 0.5)
|
||||||
- 0.5*np.log(self.sigma2 * self.v * np.pi)
|
- 0.5*np.log(self.sigma2 * self.v * np.pi)
|
||||||
- 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
|
- 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
|
||||||
)
|
)
|
||||||
return np.sum(objective)
|
return -np.sum(objective)
|
||||||
|
|
||||||
def dlik_df(self, y, f, extra_data=None):
|
def dlik_df(self, y, f, extra_data=None):
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
|
|
@ -64,7 +64,7 @@ def dparam_checkgrad(func, dfunc, params, args, constrain_positive=True, randomi
|
||||||
|
|
||||||
class LaplaceTests(unittest.TestCase):
|
class LaplaceTests(unittest.TestCase):
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.N = 50
|
self.N = 5
|
||||||
self.D = 3
|
self.D = 3
|
||||||
self.X = np.random.rand(self.N, self.D)*10
|
self.X = np.random.rand(self.N, self.D)*10
|
||||||
|
|
||||||
|
|
@ -101,6 +101,25 @@ class LaplaceTests(unittest.TestCase):
|
||||||
-np.log(self.gauss._mass(self.f.copy(), self.Y.copy())),
|
-np.log(self.gauss._mass(self.f.copy(), self.Y.copy())),
|
||||||
self.gauss._nlog_mass(self.f.copy(), self.Y.copy()))
|
self.gauss._nlog_mass(self.f.copy(), self.Y.copy()))
|
||||||
|
|
||||||
|
def test_mass_dnlog_mass_dgp_ndlik_df(self):
|
||||||
|
print "\n{}".format(inspect.stack()[0][3])
|
||||||
|
np.testing.assert_almost_equal(
|
||||||
|
self.gauss._dnlog_mass_dgp(gp=self.f.copy(), obs=self.Y.copy()),
|
||||||
|
-self.gauss.dlik_df(y=self.Y.copy(), f=self.f.copy()))
|
||||||
|
|
||||||
|
def test_mass_d2nlog_mass_dgp2_nd2lik_d2f(self):
|
||||||
|
print "\n{}".format(inspect.stack()[0][3])
|
||||||
|
np.testing.assert_almost_equal(
|
||||||
|
self.gauss._d2nlog_mass_dgp2(gp=self.f.copy(), obs=self.Y.copy()),
|
||||||
|
-self.gauss.d2lik_d2f(y=self.Y.copy(), f=self.f.copy()))
|
||||||
|
|
||||||
|
def test_mass_d2nlog_mass_dgp3_nd2lik_d3f(self):
|
||||||
|
print "\n{}".format(inspect.stack()[0][3])
|
||||||
|
np.testing.assert_almost_equal(
|
||||||
|
self.gauss._d3nlog_mass_dgp3(gp=self.f.copy(), obs=self.Y.copy()),
|
||||||
|
-self.gauss.d3lik_d3f(y=self.Y.copy(), f=self.f.copy()))
|
||||||
|
|
||||||
|
|
||||||
def test_gaussian_dnlog_mass_dgp(self):
|
def test_gaussian_dnlog_mass_dgp(self):
|
||||||
print "\n{}".format(inspect.stack()[0][3])
|
print "\n{}".format(inspect.stack()[0][3])
|
||||||
link = functools.partial(self.gauss._nlog_mass, obs=self.Y)
|
link = functools.partial(self.gauss._nlog_mass, obs=self.Y)
|
||||||
|
|
@ -119,24 +138,38 @@ class LaplaceTests(unittest.TestCase):
|
||||||
grad.checkgrad(verbose=1)
|
grad.checkgrad(verbose=1)
|
||||||
self.assertTrue(grad.checkgrad())
|
self.assertTrue(grad.checkgrad())
|
||||||
|
|
||||||
|
def test_gaussian_d3nlog_mass_d3gp(self):
|
||||||
def test_gaussian_dlik_df(self):
|
|
||||||
print "\n{}".format(inspect.stack()[0][3])
|
print "\n{}".format(inspect.stack()[0][3])
|
||||||
link = functools.partial(self.gauss.lik_function, self.Y)
|
link = functools.partial(self.gauss._d2nlog_mass_dgp2, obs=self.Y)
|
||||||
dlik_df = functools.partial(self.gauss.dlik_df, self.Y)
|
dlik_df = functools.partial(self.gauss._d3nlog_mass_dgp3, obs=self.Y)
|
||||||
grad = GradientChecker(link, dlik_df, self.f.copy(), 'f')
|
grad = GradientChecker(link, dlik_df, self.f.copy(), 'g')
|
||||||
grad.randomize()
|
grad.randomize()
|
||||||
grad.checkgrad(verbose=1)
|
grad.checkgrad(verbose=1)
|
||||||
self.assertTrue(grad.checkgrad())
|
self.assertTrue(grad.checkgrad())
|
||||||
|
|
||||||
def test_gaussian_d2lik_d2f(self):
|
def test_gaussian_dnlog_mass_dvar(self):
|
||||||
print "\n{}".format(inspect.stack()[0][3])
|
print "\n{}".format(inspect.stack()[0][3])
|
||||||
dlik_df = functools.partial(self.gauss.dlik_df, self.Y)
|
self.assertTrue(
|
||||||
d2lik_d2f = functools.partial(self.gauss.d2lik_d2f, self.Y)
|
dparam_checkgrad(self.gauss._nlog_mass, self.gauss._dnlog_mass_dvar,
|
||||||
grad = GradientChecker(dlik_df, d2lik_d2f, self.f.copy(), 'f')
|
[self.var], args=(self.Y, self.f), constrain_positive=True,
|
||||||
grad.randomize()
|
randomize=False, verbose=True)
|
||||||
grad.checkgrad(verbose=1)
|
)
|
||||||
self.assertTrue(grad.checkgrad())
|
|
||||||
|
def test_gaussian_dnlog_mass_dgp_dvar(self):
|
||||||
|
print "\n{}".format(inspect.stack()[0][3])
|
||||||
|
self.assertTrue(
|
||||||
|
dparam_checkgrad(self.gauss._dnlog_mass_dgp, self.gauss._dnlog_mass_dgp_dvar,
|
||||||
|
[self.var], args=(self.Y, self.f), constrain_positive=True,
|
||||||
|
randomize=False, verbose=True)
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_gaussian_d2nlog_mass_d2gp_dvar(self):
|
||||||
|
print "\n{}".format(inspect.stack()[0][3])
|
||||||
|
self.assertTrue(
|
||||||
|
dparam_checkgrad(self.gauss._d2nlog_mass_dgp2, self.gauss._d2nlog_mass_dgp2_dvar,
|
||||||
|
[self.var], args=(self.Y, self.f), constrain_positive=True,
|
||||||
|
randomize=False, verbose=True)
|
||||||
|
)
|
||||||
|
|
||||||
""" Gradchecker fault """
|
""" Gradchecker fault """
|
||||||
@unittest.expectedFailure
|
@unittest.expectedFailure
|
||||||
|
|
@ -154,8 +187,8 @@ class LaplaceTests(unittest.TestCase):
|
||||||
self.f = np.random.rand(self.N, 1)
|
self.f = np.random.rand(self.N, 1)
|
||||||
self.gauss = GPy.likelihoods.gaussian(variance=self.var, D=self.D, N=self.N)
|
self.gauss = GPy.likelihoods.gaussian(variance=self.var, D=self.D, N=self.N)
|
||||||
|
|
||||||
dlik_df = functools.partial(self.gauss.dlik_df, self.Y)
|
dlik_df = functools.partial(self.gauss._dnlog_mass_dgp, obs=self.Y)
|
||||||
d2lik_d2f = functools.partial(self.gauss.d2lik_d2f, self.Y)
|
d2lik_d2f = functools.partial(self.gauss._d2nlog_mass_dgp2, obs=self.Y)
|
||||||
grad = GradientChecker(dlik_df, d2lik_d2f, self.f.copy(), 'f')
|
grad = GradientChecker(dlik_df, d2lik_d2f, self.f.copy(), 'f')
|
||||||
grad.randomize()
|
grad.randomize()
|
||||||
grad.checkgrad(verbose=1)
|
grad.checkgrad(verbose=1)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue