diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index 55b69bd7..37cd71ec 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -3,7 +3,7 @@ from _src.rbf import RBF from _src.linear import Linear, LinearFull from _src.static import Bias, White from _src.brownian import Brownian -from _src.sympykern import Sympykern +from _src.symbolic import Symbolic from _src.stationary import Exponential, Matern32, Matern52, ExpQuad, RatQuad, Cosine from _src.mlp import MLP from _src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52 diff --git a/GPy/kern/_src/sympykern.py b/GPy/kern/_src/symbolic.py similarity index 56% rename from GPy/kern/_src/sympykern.py rename to GPy/kern/_src/symbolic.py index 6f066e98..2d4cbc59 100644 --- a/GPy/kern/_src/sympykern.py +++ b/GPy/kern/_src/symbolic.py @@ -11,7 +11,7 @@ from kern import Kern from ...core.parameterization import Param from ...core.parameterization.transformations import Logexp -class Sympykern(Kern): +class Symbolic(Kern): """ A kernel object, where all the hard work in done by sympy. @@ -26,10 +26,8 @@ class Sympykern(Kern): - to handle multiple inputs, call them x_1, z_1, etc - to handle multpile correlated outputs, you'll need to add parameters with an index, such as lengthscale_i and lengthscale_j. """ - def __init__(self, input_dim, k=None, output_dim=1, name=None, param=None, active_dims=None): + def __init__(self, input_dim, k=None, output_dim=1, name='symbolic', param=None, active_dims=None, operators=None): - if name is None: - name='sympykern' if k is None: raise ValueError, "You must provide an argument for the covariance function." super(Sympykern, self).__init__(input_dim, active_dims, name) @@ -60,7 +58,6 @@ class Sympykern(Kern): # extract parameter names from the covariance thetas = sorted([e for e in sp_vars if not (e.name[0:2]=='x_' or e.name[0:2]=='z_')],key=lambda e:e.name) - # Look for parameters with index (subscripts), they are associated with different outputs. 
if self.output_dim>1: self._sp_theta_i = sorted([e for e in thetas if (e.name[-2:]=='_i')], key=lambda e:e.name) @@ -117,6 +114,12 @@ class Sympykern(Kern): self.arg_list += self._sp_theta_i + self._sp_theta_j self.diag_arg_list += self._sp_theta_i + # Check if there are additional linear operators on the covariance. + self._sp_operators = operators + # TODO: Deal with linear operators + #if self._sp_operators: + # for operator in self._sp_operators: + # psi_stats aren't yet implemented. if False: self.compute_psi_stats() @@ -254,3 +257,176 @@ class Sympykern(Kern): self._reverse_arguments[theta_i.name] = self._arguments[theta_j.name].T self._reverse_arguments[theta_j.name] = self._arguments[theta_i.name].T +if False: + class Symcombine(CombinationKernel): + """ + Combine list of given sympy covariances together with the provided operations. + """ + def __init__(self, subkerns, operations, name='sympy_combine'): + super(Symcombine, self).__init__(subkerns, name) + for subkern, operation in zip(subkerns, operations): + self._sp_k += self._k_double_operate(subkern._sp_k, operation) + + #def _double_operate(self, k, operation): + + + @Cache_this(limit=2, force_kwargs=['which_parts']) + def K(self, X, X2=None, which_parts=None): + """ + Combine covariances with a linear operator. 
+ """ + assert X.shape[1] == self.input_dim + if which_parts is None: + which_parts = self.parts + elif not isinstance(which_parts, (list, tuple)): + # if only one part is given + which_parts = [which_parts] + return reduce(np.add, (p.K(X, X2) for p in which_parts)) + + @Cache_this(limit=2, force_kwargs=['which_parts']) + def Kdiag(self, X, which_parts=None): + assert X.shape[1] == self.input_dim + if which_parts is None: + which_parts = self.parts + elif not isinstance(which_parts, (list, tuple)): + # if only one part is given + which_parts = [which_parts] + return reduce(np.add, (p.Kdiag(X) for p in which_parts)) + + def update_gradients_full(self, dL_dK, X, X2=None): + [p.update_gradients_full(dL_dK, X, X2) for p in self.parts] + + def update_gradients_diag(self, dL_dK, X): + [p.update_gradients_diag(dL_dK, X) for p in self.parts] + + def gradients_X(self, dL_dK, X, X2=None): + """Compute the gradient of the objective function with respect to X. + + :param dL_dK: An array of gradients of the objective function with respect to the covariance function. 
+ :type dL_dK: np.ndarray (num_samples x num_inducing) + :param X: Observed data inputs + :type X: np.ndarray (num_samples x input_dim) + :param X2: Observed data inputs (optional, defaults to X) + :type X2: np.ndarray (num_inducing x input_dim)""" + + target = np.zeros(X.shape) + [target.__iadd__(p.gradients_X(dL_dK, X, X2)) for p in self.parts] + return target + + def gradients_X_diag(self, dL_dKdiag, X): + target = np.zeros(X.shape) + [target.__iadd__(p.gradients_X_diag(dL_dKdiag, X)) for p in self.parts] + return target + + def psi0(self, Z, variational_posterior): + return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts)) + + def psi1(self, Z, variational_posterior): + return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts)) + + def psi2(self, Z, variational_posterior): + psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts)) + #return psi2 + # compute the "cross" terms + from static import White, Bias + from rbf import RBF + #from rbf_inv import RBFInv + from linear import Linear + #ffrom fixed import Fixed + + for p1, p2 in itertools.combinations(self.parts, 2): + # i1, i2 = p1.active_dims, p2.active_dims + # white doesn;t combine with anything + if isinstance(p1, White) or isinstance(p2, White): + pass + # rbf X bias + #elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)): + elif isinstance(p1, Bias) and isinstance(p2, (RBF, Linear)): + tmp = p2.psi1(Z, variational_posterior) + psi2 += p1.variance * (tmp[:, :, None] + tmp[:, None, :]) + #elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)): + elif isinstance(p2, Bias) and isinstance(p1, (RBF, Linear)): + tmp = p1.psi1(Z, variational_posterior) + psi2 += p2.variance * (tmp[:, :, None] + tmp[:, None, :]) + elif isinstance(p2, (RBF, Linear)) and isinstance(p1, (RBF, Linear)): + assert np.intersect1d(p1.active_dims, p2.active_dims).size == 0, "only non overlapping kernel dimensions allowed so far" + tmp1 = 
p1.psi1(Z, variational_posterior) + tmp2 = p2.psi1(Z, variational_posterior) + psi2 += (tmp1[:, :, None] * tmp2[:, None, :]) + (tmp2[:, :, None] * tmp1[:, None, :]) + else: + raise NotImplementedError, "psi2 cannot be computed for this kernel" + return psi2 + + def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + from static import White, Bias + for p1 in self.parts: + #compute the effective dL_dpsi1. Extra terms appear becaue of the cross terms in psi2! + eff_dL_dpsi1 = dL_dpsi1.copy() + for p2 in self.parts: + if p2 is p1: + continue + if isinstance(p2, White): + continue + elif isinstance(p2, Bias): + eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2. + else:# np.setdiff1d(p1.active_dims, ar2, assume_unique): # TODO: Careful, not correct for overlapping active_dims + eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2. + p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior) + + def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + from static import White, Bias + target = np.zeros(Z.shape) + for p1 in self.parts: + #compute the effective dL_dpsi1. extra terms appear becaue of the cross terms in psi2! + eff_dL_dpsi1 = dL_dpsi1.copy() + for p2 in self.parts: + if p2 is p1: + continue + if isinstance(p2, White): + continue + elif isinstance(p2, Bias): + eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2. + else: + eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2. + target += p1.gradients_Z_expectations(eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior) + return target + + def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + from static import White, Bias + target_mu = np.zeros(variational_posterior.shape) + target_S = np.zeros(variational_posterior.shape) + for p1 in self._parameters_: + #compute the effective dL_dpsi1. 
extra terms appear becaue of the cross terms in psi2! + eff_dL_dpsi1 = dL_dpsi1.copy() + for p2 in self._parameters_: + if p2 is p1: + continue + if isinstance(p2, White): + continue + elif isinstance(p2, Bias): + eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2. + else: + eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2. + a, b = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior) + target_mu += a + target_S += b + return target_mu, target_S + + def _getstate(self): + """ + Get the current state of the class, + here just all the indices, rest can get recomputed + """ + return super(Add, self)._getstate() + + def _setstate(self, state): + super(Add, self)._setstate(state) + + def add(self, other, name='sum'): + if isinstance(other, Add): + other_params = other._parameters_.copy() + for p in other_params: + other.remove_parameter(p) + self.add_parameters(*other_params) + else: self.add_parameter(other) + return self diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py index 28e44541..87229081 100644 --- a/GPy/likelihoods/__init__.py +++ b/GPy/likelihoods/__init__.py @@ -6,3 +6,4 @@ from poisson import Poisson from student_t import StudentT from likelihood import Likelihood from mixed_noise import MixedNoise +from symbolic import Symbolic diff --git a/GPy/likelihoods/bernoulli.py b/GPy/likelihoods/bernoulli.py index 371fbe63..7b867954 100644 --- a/GPy/likelihoods/bernoulli.py +++ b/GPy/likelihoods/bernoulli.py @@ -15,7 +15,7 @@ class Bernoulli(Likelihood): p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-f_{i})^{1-y_{i}} .. Note:: - Y is expected to take values in {-1, 1} TODO: {0, 1}?? + Y takes values in either {-1, 1} or {0, 1}. link function should have the domain [0, 1], e.g. probit (default) or Heaviside .. See also:: @@ -54,10 +54,10 @@ class Bernoulli(Likelihood): """ if Y_i == 1: sign = 1. 
- elif Y_i == 0: + elif Y_i == 0 or Y_i == -1: sign = -1 else: - raise ValueError("bad value for Bernouilli observation (0, 1)") + raise ValueError("bad value for Bernoulli observation (0, 1)") if isinstance(self.gp_link, link_functions.Probit): z = sign*v_i/np.sqrt(tau_i**2 + tau_i) Z_hat = std_norm_cdf(z) @@ -95,15 +95,15 @@ class Bernoulli(Likelihood): else: return np.nan - def pdf_link(self, link_f, y, Y_metadata=None): + def pdf_link(self, inv_link_f, y, Y_metadata=None): """ - Likelihood function given link(f) + Likelihood function given inverse link of f. .. math:: p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-f_{i})^{1-y_{i}} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables inverse link of f. + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata not used in bernoulli @@ -113,102 +113,106 @@ class Bernoulli(Likelihood): .. Note: Each y_i must be in {0, 1} """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - #objective = (link_f**y) * ((1.-link_f)**(1.-y)) - objective = np.where(y, link_f, 1.-link_f) + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + #objective = (inv_link_f**y) * ((1.-inv_link_f)**(1.-y)) + objective = np.where(y, inv_link_f, 1.-inv_link_f) return np.exp(np.sum(np.log(objective))) - def logpdf_link(self, link_f, y, Y_metadata=None): + def logpdf_link(self, inv_link_f, y, Y_metadata=None): """ - Log Likelihood function given link(f) + Log Likelihood function given inverse link of f. .. math:: \\ln p(y_{i}|\\lambda(f_{i})) = y_{i}\\log\\lambda(f_{i}) + (1-y_{i})\\log (1-f_{i}) - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables inverse link of f. 
+ :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata not used in bernoulli - :returns: log likelihood evaluated at points link(f) + :returns: log likelihood evaluated at points inverse link of f. :rtype: float """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - #objective = y*np.log(link_f) + (1.-y)*np.log(link_f) + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + #objective = y*np.log(inv_link_f) + (1.-y)*np.log(inv_link_f) state = np.seterr(divide='ignore') - objective = np.where(y==1, np.log(link_f), np.log(1-link_f)) + # TODO check y \in {0, 1} or {-1, 1} + objective = np.where(y==1, np.log(inv_link_f), np.log(1-inv_link_f)) np.seterr(**state) return np.sum(objective) - def dlogpdf_dlink(self, link_f, y, Y_metadata=None): + def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None): """ - Gradient of the pdf at y, given link(f) w.r.t link(f) + Gradient of the pdf at y, given inverse link of f w.r.t inverse link of f. .. math:: \\frac{d\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{y_{i}}{\\lambda(f_{i})} - \\frac{(1 - y_{i})}{(1 - \\lambda(f_{i}))} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables inverse link of f. + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata not used in bernoulli - :returns: gradient of log likelihood evaluated at points link(f) + :returns: gradient of log likelihood evaluated at points inverse link of f. 
:rtype: Nx1 array """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - #grad = (y/link_f) - (1.-y)/(1-link_f) + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + #grad = (y/inv_link_f) - (1.-y)/(1-inv_link_f) state = np.seterr(divide='ignore') - grad = np.where(y, 1./link_f, -1./(1-link_f)) + # TODO check y \in {0, 1} or {-1, 1} + grad = np.where(y, 1./inv_link_f, -1./(1-inv_link_f)) np.seterr(**state) return grad - def d2logpdf_dlink2(self, link_f, y, Y_metadata=None): + def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None): """ - Hessian at y, given link_f, w.r.t link_f the hessian will be 0 unless i == j - i.e. second derivative logpdf at y given link(f_i) link(f_j) w.r.t link(f_i) and link(f_j) + Hessian at y, given inv_link_f, w.r.t inv_link_f the hessian will be 0 unless i == j + i.e. second derivative logpdf at y given inverse link of f_i and inverse link of f_j w.r.t inverse link of f_i and inverse link of f_j. .. math:: \\frac{d^{2}\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)^{2}} = \\frac{-y_{i}}{\\lambda(f)^{2}} - \\frac{(1-y_{i})}{(1-\\lambda(f))^{2}} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables inverse link of f. + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata not used in bernoulli - :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points link(f)) + :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points inverse link of f. :rtype: Nx1 array .. 
Note:: Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases - (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i)) + (the distribution for y_i depends only on inverse link of f_i not on inverse link of f_(j!=i) """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - #d2logpdf_dlink2 = -y/(link_f**2) - (1-y)/((1-link_f)**2) + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + #d2logpdf_dlink2 = -y/(inv_link_f**2) - (1-y)/((1-inv_link_f)**2) state = np.seterr(divide='ignore') - d2logpdf_dlink2 = np.where(y, -1./np.square(link_f), -1./np.square(1.-link_f)) + # TODO check y \in {0, 1} or {-1, 1} + d2logpdf_dlink2 = np.where(y, -1./np.square(inv_link_f), -1./np.square(1.-inv_link_f)) np.seterr(**state) return d2logpdf_dlink2 - def d3logpdf_dlink3(self, link_f, y, Y_metadata=None): + def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None): """ - Third order derivative log-likelihood function at y given link(f) w.r.t link(f) + Third order derivative log-likelihood function at y given inverse link of f w.r.t inverse link of f .. math:: \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2y_{i}}{\\lambda(f)^{3}} - \\frac{2(1-y_{i}}{(1-\\lambda(f))^{3}} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables passed through inverse link of f. 
+ :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata not used in bernoulli - :returns: third derivative of log likelihood evaluated at points link(f) + :returns: third derivative of log likelihood evaluated at points inverse_link(f) :rtype: Nx1 array """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - #d3logpdf_dlink3 = 2*(y/(link_f**3) - (1-y)/((1-link_f)**3)) + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + #d3logpdf_dlink3 = 2*(y/(inv_link_f**3) - (1-y)/((1-inv_link_f)**3)) state = np.seterr(divide='ignore') - d3logpdf_dlink3 = np.where(y, 2./(link_f**3), -2./((1.-link_f)**3)) + # TODO check y \in {0, 1} or {-1, 1} + d3logpdf_dlink3 = np.where(y, 2./(inv_link_f**3), -2./((1.-inv_link_f)**3)) np.seterr(**state) return d3logpdf_dlink3 diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py index aabe93ef..5761f3fb 100644 --- a/GPy/likelihoods/likelihood.py +++ b/GPy/likelihoods/likelihood.py @@ -16,20 +16,20 @@ class Likelihood(Parameterized): Likelihood base class, used to defing p(y|f). All instances use _inverse_ link functions, which can be swapped out. It is - expected that inherriting classes define a default inverse link function + expected that inheriting classes define a default inverse link function - To use this class, inherrit and define missing functionality. + To use this class, inherit and define missing functionality. 
- Inherriting classes *must* implement: + Inheriting classes *must* implement: pdf_link : a bound method which turns the output of the link function into the pdf logpdf_link : the logarithm of the above - To enable use with EP, inherriting classes *must* define: + To enable use with EP, inheriting classes *must* define: TODO: a suitable derivative function for any parameters of the class It is also desirable to define: moments_match_ep : a function to compute the EP moments If this isn't defined, the moments will be computed using 1D quadrature. - To enable use with Laplace approximation, inherriting classes *must* define: + To enable use with Laplace approximation, inheriting classes *must* define: Some derivative functions *AS TODO* For exact Gaussian inference, define *JH TODO* @@ -159,7 +159,7 @@ class Likelihood(Parameterized): def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None): """ - Numerical approximation to the predictive variance: V(Y_star) + Approximation to the predictive variance: V(Y_star) The following variance decomposition is used: V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) ) @@ -208,28 +208,28 @@ class Likelihood(Parameterized): # V(Y_star) = E[ V(Y_star|f_star) ] + E(Y_star**2|f_star) - E[Y_star|f_star]**2 return exp_var + var_exp - def pdf_link(self, link_f, y, Y_metadata=None): + def pdf_link(self, inv_link_f, y, Y_metadata=None): raise NotImplementedError - def logpdf_link(self, link_f, y, Y_metadata=None): + def logpdf_link(self, inv_link_f, y, Y_metadata=None): raise NotImplementedError - def dlogpdf_dlink(self, link_f, y, Y_metadata=None): + def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None): raise NotImplementedError - def d2logpdf_dlink2(self, link_f, y, Y_metadata=None): + def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None): raise NotImplementedError - def d3logpdf_dlink3(self, link_f, y, Y_metadata=None): + def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None): raise 
NotImplementedError - def dlogpdf_link_dtheta(self, link_f, y, Y_metadata=None): + def dlogpdf_link_dtheta(self, inv_link_f, y, Y_metadata=None): raise NotImplementedError - def dlogpdf_dlink_dtheta(self, link_f, y, Y_metadata=None): + def dlogpdf_dlink_dtheta(self, inv_link_f, y, Y_metadata=None): raise NotImplementedError - def d2logpdf_dlink2_dtheta(self, link_f, y, Y_metadata=None): + def d2logpdf_dlink2_dtheta(self, inv_link_f, y, Y_metadata=None): raise NotImplementedError def pdf(self, f, y, Y_metadata=None): @@ -247,8 +247,8 @@ class Likelihood(Parameterized): :returns: likelihood evaluated for this point :rtype: float """ - link_f = self.gp_link.transf(f) - return self.pdf_link(link_f, y, Y_metadata=Y_metadata) + inv_link_f = self.gp_link.transf(f) + return self.pdf_link(inv_link_f, y, Y_metadata=Y_metadata) def logpdf(self, f, y, Y_metadata=None): """ @@ -265,8 +265,8 @@ class Likelihood(Parameterized): :returns: log likelihood evaluated for this point :rtype: float """ - link_f = self.gp_link.transf(f) - return self.logpdf_link(link_f, y, Y_metadata=Y_metadata) + inv_link_f = self.gp_link.transf(f) + return self.logpdf_link(inv_link_f, y, Y_metadata=Y_metadata) def dlogpdf_df(self, f, y, Y_metadata=None): """ @@ -284,8 +284,8 @@ class Likelihood(Parameterized): :returns: derivative of log likelihood evaluated for this point :rtype: 1xN array """ - link_f = self.gp_link.transf(f) - dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, Y_metadata=Y_metadata) + inv_link_f = self.gp_link.transf(f) + dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata) dlink_df = self.gp_link.dtransf_df(f) return chain_1(dlogpdf_dlink, dlink_df) @@ -305,10 +305,10 @@ class Likelihood(Parameterized): :returns: second derivative of log likelihood evaluated for this point (diagonal only) :rtype: 1xN array """ - link_f = self.gp_link.transf(f) - d2logpdf_dlink2 = self.d2logpdf_dlink2(link_f, y, Y_metadata=Y_metadata) + inv_link_f = self.gp_link.transf(f) + 
d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata) dlink_df = self.gp_link.dtransf_df(f) - dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, Y_metadata=Y_metadata) + dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata) d2link_df2 = self.gp_link.d2transf_df2(f) return chain_2(d2logpdf_dlink2, dlink_df, dlogpdf_dlink, d2link_df2) @@ -328,12 +328,12 @@ class Likelihood(Parameterized): :returns: third derivative of log likelihood evaluated for this point :rtype: float """ - link_f = self.gp_link.transf(f) - d3logpdf_dlink3 = self.d3logpdf_dlink3(link_f, y, Y_metadata=Y_metadata) + inv_link_f = self.gp_link.transf(f) + d3logpdf_dlink3 = self.d3logpdf_dlink3(inv_link_f, y, Y_metadata=Y_metadata) dlink_df = self.gp_link.dtransf_df(f) - d2logpdf_dlink2 = self.d2logpdf_dlink2(link_f, y, Y_metadata=Y_metadata) + d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata) d2link_df2 = self.gp_link.d2transf_df2(f) - dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, Y_metadata=Y_metadata) + dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata) d3link_df3 = self.gp_link.d3transf_df3(f) return chain_3(d3logpdf_dlink3, dlink_df, d2logpdf_dlink2, d2link_df2, dlogpdf_dlink, d3link_df3) @@ -342,10 +342,10 @@ class Likelihood(Parameterized): TODO: Doc strings """ if self.size > 0: - link_f = self.gp_link.transf(f) - return self.dlogpdf_link_dtheta(link_f, y, Y_metadata=Y_metadata) + inv_link_f = self.gp_link.transf(f) + return self.dlogpdf_link_dtheta(inv_link_f, y, Y_metadata=Y_metadata) else: - #Is no parameters so return an empty array for its derivatives + # There are no parameters so return an empty array for derivatives return np.zeros([1, 0]) def dlogpdf_df_dtheta(self, f, y, Y_metadata=None): @@ -353,12 +353,12 @@ class Likelihood(Parameterized): TODO: Doc strings """ if self.size > 0: - link_f = self.gp_link.transf(f) + inv_link_f = self.gp_link.transf(f) dlink_df = self.gp_link.dtransf_df(f) - 
dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, Y_metadata=Y_metadata) + dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata) return chain_1(dlogpdf_dlink_dtheta, dlink_df) else: - #Is no parameters so return an empty array for its derivatives + # There are no parameters so return an empty array for derivatives return np.zeros([f.shape[0], 0]) def d2logpdf_df2_dtheta(self, f, y, Y_metadata=None): @@ -366,14 +366,14 @@ class Likelihood(Parameterized): TODO: Doc strings """ if self.size > 0: - link_f = self.gp_link.transf(f) + inv_link_f = self.gp_link.transf(f) dlink_df = self.gp_link.dtransf_df(f) d2link_df2 = self.gp_link.d2transf_df2(f) - d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(link_f, y, Y_metadata=Y_metadata) - dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, Y_metadata=Y_metadata) + d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(inv_link_f, y, Y_metadata=Y_metadata) + dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata) return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2) else: - #Is no parameters so return an empty array for its derivatives + # There are no parameters so return an empty array for derivatives return np.zeros([f.shape[0], 0]) def _laplace_gradients(self, f, y, Y_metadata=None): @@ -411,7 +411,10 @@ class Likelihood(Parameterized): #compute the quantiles by sampling!!! 
N_samp = 1000 s = np.random.randn(mu.shape[0], N_samp)*np.sqrt(var) + mu + #ss_f = s.flatten() + #ss_y = self.samples(ss_f, Y_metadata) ss_y = self.samples(s, Y_metadata) + #ss_y = ss_y.reshape(mu.shape[0], N_samp) return [np.percentile(ss_y ,q, axis=1)[:,None] for q in quantiles] diff --git a/GPy/likelihoods/student_t.py b/GPy/likelihoods/student_t.py index 47efd443..c057e789 100644 --- a/GPy/likelihoods/student_t.py +++ b/GPy/likelihoods/student_t.py @@ -26,8 +26,8 @@ class StudentT(Likelihood): gp_link = link_functions.Identity() super(StudentT, self).__init__(gp_link, name='Student_T') - - self.sigma2 = Param('t_noise', float(sigma2), Logexp()) + # sigma2 is not a noise parameter, it is a squared scale. + self.sigma2 = Param('t_scale2', float(sigma2), Logexp()) self.v = Param('deg_free', float(deg_free)) self.add_parameter(self.sigma2) self.add_parameter(self.v) @@ -46,23 +46,23 @@ class StudentT(Likelihood): self.sigma2.gradient = grads[0] self.v.gradient = grads[1] - def pdf_link(self, link_f, y, Y_metadata=None): + def pdf_link(self, inv_link_f, y, Y_metadata=None): """ Likelihood function given link(f) .. math:: p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \\lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables link(f) + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata which is not used in student t distribution :returns: likelihood evaluated for this point :rtype: float """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - e = y - link_f + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + e = y - inv_link_f #Careful gamma(big_number) is infinity! 
objective = ((np.exp(gammaln((self.v + 1)*0.5) - gammaln(self.v * 0.5)) / (np.sqrt(self.v * np.pi * self.sigma2))) @@ -70,15 +70,15 @@ class StudentT(Likelihood): ) return np.prod(objective) - def logpdf_link(self, link_f, y, Y_metadata=None): + def logpdf_link(self, inv_link_f, y, Y_metadata=None): """ Log Likelihood Function given link(f) .. math:: \\ln p(y_{i}|\lambda(f_{i})) = \\ln \\Gamma\\left(\\frac{v+1}{2}\\right) - \\ln \\Gamma\\left(\\frac{v}{2}\\right) - \\ln \\sqrt{v \\pi\\sigma^{2}} - \\frac{v+1}{2}\\ln \\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right) - :param link_f: latent variables (link(f)) - :type link_f: Nx1 array + :param inv_link_f: latent variables (link(f)) + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata which is not used in student t distribution @@ -86,11 +86,11 @@ class StudentT(Likelihood): :rtype: float """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - e = y - link_f + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + e = y - inv_link_f #FIXME: - #Why does np.log(1 + (1/self.v)*((y-link_f)**2)/self.sigma2) suppress the divide by zero?! - #But np.log(1 + (1/float(self.v))*((y-link_f)**2)/self.sigma2) throws it correctly + #Why does np.log(1 + (1/self.v)*((y-inv_link_f)**2)/self.sigma2) suppress the divide by zero?! + #But np.log(1 + (1/float(self.v))*((y-inv_link_f)**2)/self.sigma2) throws it correctly #print - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2)) objective = (+ gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5) @@ -99,15 +99,15 @@ class StudentT(Likelihood): ) return np.sum(objective) - def dlogpdf_dlink(self, link_f, y, Y_metadata=None): + def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None): """ Gradient of the log likelihood function at y, given link(f) w.r.t link(f) .. 
math:: \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{(v+1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v} - :param link_f: latent variables (f) - :type link_f: Nx1 array + :param inv_link_f: latent variables (f) + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata which is not used in student t distribution @@ -115,12 +115,12 @@ class StudentT(Likelihood): :rtype: Nx1 array """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - e = y - link_f + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + e = y - inv_link_f grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2)) return grad - def d2logpdf_dlink2(self, link_f, y, Y_metadata=None): + def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None): """ Hessian at y, given link(f), w.r.t link(f) i.e. second derivative logpdf at y given link(f_i) and link(f_j) w.r.t link(f_i) and link(f_j) @@ -129,8 +129,8 @@ class StudentT(Likelihood): .. 
math:: \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = \\frac{(v+1)((y_{i}-\lambda(f_{i}))^{2} - \\sigma^{2}v)}{((y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v)^{2}} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables inv_link(f) + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata which is not used in student t distribution @@ -141,90 +141,90 @@ class StudentT(Likelihood): Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i)) """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - e = y - link_f + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + e = y - inv_link_f hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2) return hess - def d3logpdf_dlink3(self, link_f, y, Y_metadata=None): + def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None): """ Third order derivative log-likelihood function at y given link(f) w.r.t link(f) .. 
math:: \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{-2(v+1)((y_{i} - \lambda(f_{i}))^3 - 3(y_{i} - \lambda(f_{i})) \\sigma^{2} v))}{((y_{i} - \lambda(f_{i})) + \\sigma^{2} v)^3} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables link(f) + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata which is not used in student t distribution :returns: third derivative of likelihood evaluated at points f :rtype: Nx1 array """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - e = y - link_f + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + e = y - inv_link_f d3lik_dlink3 = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) / ((e**2 + self.sigma2*self.v)**3) ) return d3lik_dlink3 - def dlogpdf_link_dvar(self, link_f, y, Y_metadata=None): + def dlogpdf_link_dvar(self, inv_link_f, y, Y_metadata=None): """ Gradient of the log-likelihood function at y given f, w.r.t variance parameter (t_noise) .. 
math:: \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\sigma^{2}} = \\frac{v((y_{i} - \lambda(f_{i}))^{2} - \\sigma^{2})}{2\\sigma^{2}(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables inv_link(f) + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata which is not used in student t distribution :returns: derivative of likelihood evaluated at points f w.r.t variance parameter :rtype: float """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - e = y - link_f + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + e = y - inv_link_f dlogpdf_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2)) return np.sum(dlogpdf_dvar) - def dlogpdf_dlink_dvar(self, link_f, y, Y_metadata=None): + def dlogpdf_dlink_dvar(self, inv_link_f, y, Y_metadata=None): """ Derivative of the dlogpdf_dlink w.r.t variance parameter (t_noise) .. 
math:: \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{df}) = \\frac{-2\\sigma v(v + 1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^2 + \\sigma^2 v)^2} - :param link_f: latent variables link_f - :type link_f: Nx1 array + :param inv_link_f: latent variables inv_link_f + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata which is not used in student t distribution :returns: derivative of likelihood evaluated at points f w.r.t variance parameter :rtype: Nx1 array """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - e = y - link_f + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + e = y - inv_link_f dlogpdf_dlink_dvar = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2) return dlogpdf_dlink_dvar - def d2logpdf_dlink2_dvar(self, link_f, y, Y_metadata=None): + def d2logpdf_dlink2_dvar(self, inv_link_f, y, Y_metadata=None): """ Gradient of the hessian (d2logpdf_dlink2) w.r.t variance parameter (t_noise) .. 
math:: \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}f}) = \\frac{v(v+1)(\\sigma^{2}v - 3(y_{i} - \lambda(f_{i}))^{2})}{(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})^{3}} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables inv_link(f) + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata which is not used in student t distribution :returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter :rtype: Nx1 array """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - e = y - link_f + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + e = y - inv_link_f d2logpdf_dlink2_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2))) / ((self.sigma2*self.v + (e**2))**3) ) @@ -246,11 +246,12 @@ class StudentT(Likelihood): return np.hstack((d2logpdf_dlink2_dvar, d2logpdf_dlink2_dv)) def predictive_mean(self, mu, sigma, Y_metadata=None): - return self.gp_link.transf(mu) # only true in link is monotoci, which it is. + # The comment here confuses mean and median. + return self.gp_link.transf(mu) # only true if link is monotonic, which it is. def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None): - if self.deg_free <2.: - return np.empty(mu.shape)*np.nan #not defined for small degress fo freedom + if self.deg_free<=2.: + return np.empty(mu.shape)*np.nan # does not exist for degrees of freedom <= 2. else: return super(StudentT, self).predictive_variance(mu, variance, predictive_mean, Y_metadata)