diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index 2b0a201d..ae9d8eb8 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -96,15 +96,11 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=
 
     # Optimize
     if optimize:
-        #m.update_likelihood_approximation()
-        # Parameters optimization:
         try:
             m.optimize('scg', messages=1)
         except Exception as e:
             return m
-        #m.pseudo_EM()
-
 
     # Plot
     if plot:
         fig, axes = pb.subplots(2, 1)
@@ -133,10 +129,7 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti
 
     # Optimize
     if optimize:
-        #m.update_likelihood_approximation()
-        # Parameters optimization:
-        #m.optimize()
-        m.pseudo_EM()
+        m.optimize()
 
     # Plot
     if plot:
diff --git a/GPy/inference/latent_function_inference/expectation_propagation.py b/GPy/inference/latent_function_inference/expectation_propagation.py
index 870f2310..de8b0931 100644
--- a/GPy/inference/latent_function_inference/expectation_propagation.py
+++ b/GPy/inference/latent_function_inference/expectation_propagation.py
@@ -24,6 +24,13 @@ class EP(LatentFunctionInference):
         self.old_mutilde, self.old_vtilde = None, None
         self._ep_approximation = None
 
+    def on_optimization_start(self):
+        self._ep_approximation = None
+
+    def on_optimization_end(self):
+        # TODO: update the approximation at the end as well? Maybe even with a switch?
+        pass
+
     def inference(self, kern, X, likelihood, Y, Y_metadata=None, Z=None):
         num_data, output_dim = X.shape
         assert output_dim ==1, "ep in 1D only (for now!)"
@@ -47,8 +54,6 @@ class EP(LatentFunctionInference):
 
         return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
 
-
-
     def expectation_propagation(self, K, Y, likelihood, Y_metadata):
         num_data, data_dim = Y.shape
 
@@ -113,4 +118,3 @@ class EP(LatentFunctionInference):
         mu_tilde = v_tilde/tau_tilde
 
         return mu, Sigma, mu_tilde, tau_tilde, Z_hat
-
diff --git a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py
index be1efd3a..85d8cc89 100644
--- a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py
+++ b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py
@@ -1,14 +1,56 @@
 import numpy as np
-from ...util.linalg import pdinv,jitchol,DSYR,tdot,dtrtrs, dpotrs
-from expectation_propagation import EP
+from ...util import diag
+from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify, DSYR
+from ...util.misc import param_to_array
+from ...core.parameterization.variational import VariationalPosterior
+from . import LatentFunctionInference
 from posterior import Posterior
 log_2_pi = np.log(2*np.pi)
 
-class EPDTC(EP):
-    def __init__(self, epsilon=1e-6, eta=1., delta=1.):
+class EPDTC(LatentFunctionInference):
+    const_jitter = 1e-6
+    def __init__(self, epsilon=1e-6, eta=1., delta=1., limit=1):
+        from ...util.caching import Cacher
+        self.limit = limit
+        self.get_trYYT = Cacher(self._get_trYYT, limit)
+        self.get_YYTfactor = Cacher(self._get_YYTfactor, limit)
+
         self.epsilon, self.eta, self.delta = epsilon, eta, delta
         self.reset()
 
+    def set_limit(self, limit):
+        self.get_trYYT.limit = limit
+        self.get_YYTfactor.limit = limit
+
+    def _get_trYYT(self, Y):
+        return param_to_array(np.sum(np.square(Y)))
+
+    def __getstate__(self):
+        # has to be overridden, as Cacher objects cannot be pickled.
+        return self.limit
+
+    def __setstate__(self, state):
+        # has to be overridden, as Cacher objects cannot be pickled.
+        self.limit = state
+        from ...util.caching import Cacher
+        self.get_trYYT = Cacher(self._get_trYYT, self.limit)
+        self.get_YYTfactor = Cacher(self._get_YYTfactor, self.limit)
+
+    def _get_YYTfactor(self, Y):
+        """
+        Find a matrix L which satisfies LL^T = YY^T.
+
+        Note that L may have fewer columns than Y.
+        """
+        N, D = Y.shape
+        if (N>=D):
+            return param_to_array(Y)
+        else:
+            return jitchol(tdot(Y))
+
+    def get_VVTfactor(self, Y, prec):
+        return Y * prec # TODO: cache this, and make it efficient
+
     def reset(self):
         self.old_mutilde, self.old_vtilde = None, None
         self._ep_approximation = None
@@ -20,28 +62,131 @@ class EPDTC(EP):
 
         Kmm = kern.K(Z)
         Kmn = kern.K(Z,X)
 
-        Lm = jitchol(Kmm)
-        Lmi = dtrtrs(Lm,np.eye(Lm.shape[0]))[0]
-        Kmmi = np.dot(Lmi.T,Lmi)
-        KmmiKmn = np.dot(Kmmi,Kmn)
-        K = np.dot(Kmn.T,KmmiKmn)
-
         if self._ep_approximation is None:
             mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata)
         else:
             mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation
 
-        Wi, LW, LWi, W_logdet = pdinv(K + np.diag(1./tau_tilde))
-        alpha, _ = dpotrs(LW, mu_tilde, lower=1)
+        if isinstance(X, VariationalPosterior):
+            uncertain_inputs = True
+            psi0 = kern.psi0(Z, X)
+            psi1 = Kmn.T#kern.psi1(Z, X)
+            psi2 = kern.psi2(Z, X)
+        else:
+            uncertain_inputs = False
+            psi0 = kern.Kdiag(X)
+            psi1 = Kmn.T#kern.K(X, Z)
+            psi2 = None
 
-        log_marginal = 0.5*(-num_data * log_2_pi - W_logdet - np.sum(alpha * mu_tilde)) # TODO: add log Z_hat??
+        #see whether we're using variational uncertain inputs
 
-        dL_dK = 0.5 * (tdot(alpha[:,None]) - Wi)
+        _, output_dim = Y.shape
+
+        #see whether we've got a different noise variance for each datum
+        #beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
+        beta = tau_tilde
+        VVT_factor = beta[:,None]*mu_tilde[:,None]
+        trYYT = self.get_trYYT(mu_tilde[:,None])
+
+        # do the inference:
+        het_noise = beta.size > 1
+        num_inducing = Z.shape[0]
+        num_data = Y.shape[0]
+        # kernel computations, using BGPLVM notation
+
+        Kmm = kern.K(Z).copy()
+        diag.add(Kmm, self.const_jitter)
+        Lm = jitchol(Kmm)
+
+        # The rather complex computations of A
+        if uncertain_inputs:
+            if het_noise:
+                psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0)
+            else:
+                psi2_beta = psi2.sum(0) * beta
+            LmInv = dtrtri(Lm)
+            A = LmInv.dot(psi2_beta.dot(LmInv.T))
+        else:
+            if het_noise:
+                tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
+            else:
+                tmp = psi1 * (np.sqrt(beta))
+            tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
+            A = tdot(tmp) #print A.sum()
+
+        # factor B
+        B = np.eye(num_inducing) + A
+        LB = jitchol(B)
+        psi1Vf = np.dot(psi1.T, VVT_factor)
+        # back-substitute C into psi1Vf
+        tmp, _ = dtrtrs(Lm, psi1Vf, lower=1, trans=0)
+        _LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
+        tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
+        Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
+
+        # data fit and derivative of L w.r.t. Kmm
+        delit = tdot(_LBi_Lmi_psi1Vf)
+        data_fit = np.trace(delit)
+        DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
+        delit = -0.5 * DBi_plus_BiPBi
+        delit += -0.5 * B * output_dim
+        delit += output_dim * np.eye(num_inducing)
+        # Compute dL_dKmm
+        dL_dKmm = backsub_both_sides(Lm, delit)
+
+        # derivatives of L w.r.t. psi
+        dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm,
+                                                        VVT_factor, Cpsi1Vf, DBi_plus_BiPBi,
+                                                        psi1, het_noise, uncertain_inputs)
+
+        # log marginal likelihood
+        log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
+                                                        psi0, A, LB, trYYT, data_fit, VVT_factor)
+
+        #put the gradients in the right places
+        dL_dR = _compute_dL_dR(likelihood,
+                               het_noise, uncertain_inputs, LB,
+                               _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A,
+                               psi0, psi1, beta,
+                               data_fit, num_data, output_dim, trYYT, mu_tilde[:,None])
+
+        dL_dthetaL = 0#likelihood.exact_inference_gradients(dL_dR,Y_metadata)
+
+        if uncertain_inputs:
+            grad_dict = {'dL_dKmm': dL_dKmm,
+                         'dL_dpsi0':dL_dpsi0,
+                         'dL_dpsi1':dL_dpsi1,
+                         'dL_dpsi2':dL_dpsi2,
+                         'dL_dthetaL':dL_dthetaL}
+        else:
+            grad_dict = {'dL_dKmm': dL_dKmm,
+                         'dL_dKdiag':dL_dpsi0,
+                         'dL_dKnm':dL_dpsi1,
+                         'dL_dthetaL':dL_dthetaL}
+
+        #get sufficient things for posterior prediction
+        #TODO: do we really want to do this in the loop?
+        if VVT_factor.shape[1] == Y.shape[1]:
+            woodbury_vector = Cpsi1Vf # == Cpsi1V
+        else:
+            print 'foobar'
+            psi1V = np.dot(mu_tilde[:,None].T*beta, psi1).T
+            tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
+            tmp, _ = dpotrs(LB, tmp, lower=1)
+            woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
+        Bi, _ = dpotri(LB, lower=1)
+        symmetrify(Bi)
+        Bi = -dpotri(LB, lower=1)[0]
+        diag.add(Bi, 1)
+
+        woodbury_inv = backsub_both_sides(Lm, Bi)
+
+        #construct a posterior object
+        post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
+        return post, log_marginal, grad_dict
 
-        dL_dthetaL = np.zeros(likelihood.size)#TODO: derivatives of the likelihood parameters
-        return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
@@ -129,3 +274,69 @@ class EPDTC(EP):
         mu_tilde = v_tilde/tau_tilde
 
         return mu, Sigma, mu_tilde, tau_tilde, Z_hat
+
+def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, psi1, het_noise, uncertain_inputs):
+    dL_dpsi0 = -0.5 * output_dim * (beta[:,None] * np.ones([num_data, 1])).flatten()
+    dL_dpsi1 = np.dot(VVT_factor, Cpsi1Vf.T)
+    dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi)
+    if het_noise:
+        if uncertain_inputs:
+            dL_dpsi2 = beta[:, None, None] * dL_dpsi2_beta[None, :, :]
+        else:
+            dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * beta.reshape(num_data, 1)).T).T
+            dL_dpsi2 = None
+    else:
+        dL_dpsi2 = beta * dL_dpsi2_beta
+        if uncertain_inputs:
+            # repeat for each of the N psi_2 matrices
+            dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0)
+        else:
+            # subsume back into psi1 (==Kmn)
+            dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2)
+            dL_dpsi2 = None
+
+    return dL_dpsi0, dL_dpsi1, dL_dpsi2
+
+
+def _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data, output_dim, trYYT, Y):
+    # the partial derivative vector for the likelihood
+    if likelihood.size == 0:
+        # save computation here.
+        dL_dR = None
+    elif het_noise:
+        if uncertain_inputs:
+            raise NotImplementedError, "heteroscedastic derivatives with uncertain inputs not implemented"
+        else:
+            #from ...util.linalg import chol_inv
+            #LBi = chol_inv(LB)
+            LBi, _ = dtrtrs(LB,np.eye(LB.shape[0]))
+
+            Lmi_psi1, nil = dtrtrs(Lm, psi1.T, lower=1, trans=0)
+            _LBi_Lmi_psi1, _ = dtrtrs(LB, Lmi_psi1, lower=1, trans=0)
+
+            dL_dR = -0.5 * beta + 0.5 * (beta*Y)**2
+            dL_dR += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2
+
+            dL_dR += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2
+
+            dL_dR += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * Y * beta**2
+            dL_dR += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2
+    else:
+        # likelihood is not heteroscedastic
+        dL_dR = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2
+        dL_dR += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta)
+        dL_dR += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit)
+    return dL_dR
+
+def _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, psi0, A, LB, trYYT, data_fit, Y):
+    #compute log marginal likelihood
+    if het_noise:
+        lik_1 = -0.5 * num_data * output_dim * np.log(2. * np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(beta * np.square(Y).sum(axis=-1))
+        lik_2 = -0.5 * output_dim * (np.sum(beta.flatten() * psi0) - np.trace(A))
+    else:
+        lik_1 = -0.5 * num_data * output_dim * (np.log(2. * np.pi) - np.log(beta)) - 0.5 * beta * trYYT
+        lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
+    lik_3 = -output_dim * (np.sum(np.log(np.diag(LB))))
+    lik_4 = 0.5 * data_fit
+    log_marginal = lik_1 + lik_2 + lik_3 + lik_4
+    return log_marginal
diff --git a/GPy/likelihoods/bernoulli.py b/GPy/likelihoods/bernoulli.py
index 7b867954..6c53958b 100644
--- a/GPy/likelihoods/bernoulli.py
+++ b/GPy/likelihoods/bernoulli.py
@@ -227,3 +227,6 @@ class Bernoulli(Likelihood):
         ns = np.ones_like(gp, dtype=int)
         Ysim = np.random.binomial(ns, self.gp_link.transf(gp))
         return Ysim.reshape(orig_shape)
+
+    def exact_inference_gradients(self, dL_dKdiag, Y_metadata=None):
+        pass
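
Usage sketch (illustrative, not part of the patch): the EP and EPDTC objects above cache their site approximation in _ep_approximation and rebuild it only after on_optimization_start() clears it, which is what lets the examples call m.optimize() directly instead of the removed pseudo_EM() loop. The snippet below shows how a classification model might be driven against this code; the toy data, the RBF kernel choice, and the assumption that the model triggers on_optimization_start() when optimization begins are hypothetical here. Only the EP import path is taken from the diff.

import numpy as np
import GPy
# Import path follows the module edited in the diff above.
from GPy.inference.latent_function_inference.expectation_propagation import EP

# Toy 1D classification data (made up for illustration).
np.random.seed(0)
X = np.random.rand(50, 1)
Y = (np.sin(6 * X) > 0).astype(float)

m = GPy.core.GP(X, Y,
                kernel=GPy.kern.RBF(1),
                likelihood=GPy.likelihoods.Bernoulli(),
                inference_method=EP())

# optimize() is assumed to call on_optimization_start(), which resets the
# cached EP sites so each optimizer run starts from a fresh approximation.
m.optimize('scg', messages=1)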