diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py
index 97d8dfe3..e50daa24 100644
--- a/GPy/inference/latent_function_inference/var_dtc.py
+++ b/GPy/inference/latent_function_inference/var_dtc.py
@@ -64,9 +64,7 @@ class VarDTC(LatentFunctionInference):
     def get_VVTfactor(self, Y, prec):
         return Y * prec # TODO chache this, and make it effective
-
-
-    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None):
+    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None, psi0=None, psi1=None, psi2=None):
         _, output_dim = Y.shape
         uncertain_inputs = isinstance(X, VariationalPosterior)
@@ -95,17 +93,28 @@ class VarDTC(LatentFunctionInference):
         # The rather complex computations of A, and the psi stats
         if uncertain_inputs:
-            psi0 = kern.psi0(Z, X)
-            psi1 = kern.psi1(Z, X)
+            if psi0 is None:
+                psi0 = kern.psi0(Z, X)
+            if psi1 is None:
+                psi1 = kern.psi1(Z, X)
             if het_noise:
-                psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0)
+                if psi2 is not None:
+                    assert len(psi2.shape) == 3 # Need to have not summed out N
+                    #FIXME: Need testing
+                    psi2_beta = np.sum([psi2[i] * beta_i for i,beta_i in enumerate(beta)],0)
+                else:
+                    psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0)
             else:
-                psi2_beta = kern.psi2(Z,X) * beta
+                if psi2 is None:
+                    psi2 = kern.psi2(Z,X)
+                psi2_beta = psi2 * beta
             LmInv = dtrtri(Lm)
             A = LmInv.dot(psi2_beta.dot(LmInv.T))
         else:
-            psi0 = kern.Kdiag(X)
-            psi1 = kern.K(X, Z)
+            if psi0 is None:
+                psi0 = kern.Kdiag(X)
+            if psi1 is None:
+                psi1 = kern.K(X, Z)
             if het_noise:
                 tmp = psi1 * (np.sqrt(beta))
             else:
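The heteroscedastic branch above weights each data point's MxM psi2 slab by its precision beta_i before summing over N. A quick numerical sketch of that contraction (not part of the patch; it assumes, per the FIXME, a precomputed 3-d psi2 with one slab per data point) — the einsum is a vectorised equivalent of the list-comprehension sum:

    import numpy as np

    N, M = 50, 8
    beta = np.random.rand(N)           # heteroscedastic noise precisions
    psi2 = np.random.randn(N, M, M)    # per-point psi2, as return_psi2_n=True yields

    loop = np.sum([psi2[i] * beta_i for i, beta_i in enumerate(beta)], 0)
    vec = np.einsum('n,nmo->mo', beta, psi2)
    assert np.allclose(loop, vec)

The einsum form avoids materialising N intermediate MxM products and would be a natural replacement once the FIXME is resolved.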
diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py
index 5af3fb1c..fb12030b 100644
--- a/GPy/kern/_src/kern.py
+++ b/GPy/kern/_src/kern.py
@@ -59,7 +59,12 @@ class Kern(Parameterized):
         self._sliced_X = 0
         self.useGPU = self._support_GPU and useGPU
         self._return_psi2_n_flag = ObsAr(np.zeros(1)).astype(bool)
-
+
+        #FIXME: temporary solution
+        from ...core.parameterization.lists_and_dicts import ObserverList
+        self._return_psi2_n_flag.observers = ObserverList()
+        self._return_psi2_n_flag._update_on = True
+
         from .psi_comp import PSICOMP_GH
         self.psicomp = PSICOMP_GH()
@@ -97,7 +102,7 @@ class Kern(Parameterized):
     def psi1(self, Z, variational_posterior):
         return self.psicomp.psicomputations(self, Z, variational_posterior)[1]
     def psi2(self, Z, variational_posterior):
-        return self.psicomp.psicomputations(self, Z, variational_posterior)[2]
+        return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=self.return_psi2_n)[2]
     def gradients_X(self, dL_dK, X, X2):
         raise NotImplementedError
     def gradients_X_diag(self, dL_dKdiag, X):
@@ -111,7 +116,8 @@ class Kern(Parameterized):
         """Set the gradients of all parameters when doing full (N) inference."""
         raise NotImplementedError
 
-    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
+                                      psi0=None, psi1=None, psi2=None):
         """
         Set the gradients of all parameters when doing inference with
         uncertain inputs, using expectations of the kernel.
@@ -122,22 +128,27 @@ class Kern(Parameterized):
             dL_dpsi1 * dpsi1_d{theta_i} + dL_dpsi2 * dpsi2_d{theta_i}
         """
-        dtheta = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[0]
+        dtheta = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
+                                                        psi0=psi0, psi1=psi1, psi2=psi2)[0]
         self.gradient[:] = dtheta
 
-    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
+                                 psi0=None, psi1=None, psi2=None):
         """
         Returns the derivative of the objective wrt Z, using the chain rule
         through the expectation variables.
         """
-        return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[1]
+        return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
+                                                      psi0=psi0, psi1=psi1, psi2=psi2)[1]
 
-    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
+                                  psi0=None, psi1=None, psi2=None):
         """
         Compute the gradients wrt the parameters of the variational
         distruibution q(X), chain-ruling via the expectations of the kernel
         """
-        return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[2:]
+        return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
+                                                      psi0=psi0, psi1=psi1, psi2=psi2)[2:]
 
     def plot(self, x=None, fignum=None, ax=None, title=None, plot_limits=None, resolution=None, **mpl_kwargs):
         """
diff --git a/GPy/kern/_src/kernel_slice_operations.py b/GPy/kern/_src/kernel_slice_operations.py
index 3473ffce..a2058ca8 100644
--- a/GPy/kern/_src/kernel_slice_operations.py
+++ b/GPy/kern/_src/kernel_slice_operations.py
@@ -116,25 +116,31 @@ def _slice_psi(f):
 
 def _slice_update_gradients_expectations(f):
     @wraps(f)
-    def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
+             psi0=None, psi1=None, psi2=None):
         with _Slice_wrap(self, Z, variational_posterior) as s:
-            ret = f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2)
+            ret = f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2,
+                    psi0=psi0, psi1=psi1, psi2=psi2)
         return ret
     return wrap
 
 def _slice_gradients_Z_expectations(f):
     @wraps(f)
-    def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
+             psi0=None, psi1=None, psi2=None):
         with _Slice_wrap(self, Z, variational_posterior) as s:
-            ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2))
+            ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2,
+                                          psi0=psi0, psi1=psi1, psi2=psi2))
         return ret
     return wrap
 
 def _slice_gradients_qX_expectations(f):
     @wraps(f)
-    def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
+             psi0=None, psi1=None, psi2=None):
         with _Slice_wrap(self, variational_posterior, Z) as s:
-            ret = list(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X2, s.X))
+            ret = list(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X2, s.X,
+                         psi0=psi0, psi1=psi1, psi2=psi2))
             r2 = ret[:2]
             ret[0] = s.handle_return_array(r2[0])
             ret[1] = s.handle_return_array(r2[1])
diff --git a/GPy/kern/_src/psi_comp/__init__.py b/GPy/kern/_src/psi_comp/__init__.py
index 6d7222be..6849ed81 100644
--- a/GPy/kern/_src/psi_comp/__init__.py
+++ b/GPy/kern/_src/psi_comp/__init__.py
@@ -12,18 +12,22 @@ from .gaussherm import PSICOMP_GH
 
 class PSICOMP_RBF(Pickleable):
 
     @Cache_this(limit=10, ignore_args=(0,))
-    def psicomputations(self, variance, lengthscale, Z, variational_posterior):
+    def psicomputations(self, variance, lengthscale, Z, variational_posterior, return_psi2_n=False):
         if isinstance(variational_posterior, variational.NormalPosterior):
-            return rbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
+            return rbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior, return_psi2_n=return_psi2_n)
         elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
+            if return_psi2_n:
+                raise NotImplementedError('return_psi2_n is not wired up for SpikeAndSlabPosterior yet, although ssrbf_psi_comp computes psi2n internally')
             return ssrbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
         else:
             raise ValueError("unknown distriubtion received for psi-statistics")
 
     @Cache_this(limit=10, ignore_args=(0,1,2,3))
-    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
+    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior,
+                                  psi0=None, psi1=None, psi2=None):
         if isinstance(variational_posterior, variational.NormalPosterior):
-            return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
+            return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior,
+                                                          psi0=psi0, psi1=psi1, psi2=psi2)
         elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
             return ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
         else:
@@ -35,7 +39,9 @@
 class PSICOMP_Linear(Pickleable):
 
     @Cache_this(limit=10, ignore_args=(0,))
-    def psicomputations(self, variance, Z, variational_posterior):
+    def psicomputations(self, variance, Z, variational_posterior, return_psi2_n=False):
+        if return_psi2_n:
+            raise NotImplementedError
         if isinstance(variational_posterior, variational.NormalPosterior):
             return linear_psi_comp.psicomputations(variance, Z, variational_posterior)
         elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
@@ -44,8 +50,9 @@
             raise ValueError("unknown distriubtion received for psi-statistics")
 
     @Cache_this(limit=10, ignore_args=(0,1,2,3))
-    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior):
+    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior, psi0=None, psi1=None, psi2=None):
         if isinstance(variational_posterior, variational.NormalPosterior):
+            #Should pass psi in
             return linear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
         elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
             return sslinear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
@@ -53,4 +60,4 @@
             raise ValueError("unknown distriubtion received for psi-statistics")
 
     def _setup_observers(self):
-        pass
\ No newline at end of file
+        pass
diff --git a/GPy/kern/_src/psi_comp/linear_psi_comp.py b/GPy/kern/_src/psi_comp/linear_psi_comp.py
index 50090428..cd5d8fa0 100644
--- a/GPy/kern/_src/psi_comp/linear_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/linear_psi_comp.py
@@ -8,7 +8,7 @@ The package for the Psi statistics computation of the linear kernel for Bayesian
 import numpy as np
 from ....util.linalg import tdot
 
-def psicomputations(variance, Z, variational_posterior):
+def psicomputations(variance, Z, variational_posterior, return_psi2_n=False):
     """
     Compute psi-statistics for ss-linear kernel
     """
@@ -22,7 +22,10 @@ def psicomputations(variance, Z, variational_posterior):
     psi0 = (variance*(np.square(mu)+S)).sum(axis=1)
     psi1 = np.dot(mu,(variance*Z).T)
-    psi2 = np.dot(S.sum(axis=0)*np.square(variance)*Z,Z.T)+ tdot(psi1.T)
+    if not return_psi2_n:
+        psi2 = np.dot(S.sum(axis=0)*np.square(variance)*Z,Z.T)+ tdot(psi1.T)
+    else:
+        raise NotImplementedError
     return psi0, psi1, psi2
@@ -40,7 +43,7 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variati
     dL_dmu += 2.*dL_dpsi0_var*mu+np.dot(dL_dpsi1,Z)*variance
     dL_dS += dL_dpsi0_var
     dL_dZ += dL_dpsi1_mu*variance
-
+
     return dL_dvar, dL_dZ, dL_dmu, dL_dS
 
 def _psi2computations(dL_dpsi2, variance, Z, mu, S):
@@ -56,7 +59,7 @@ def _psi2computations(dL_dpsi2, variance, Z, mu, S):
     # _psi2_dZ MxQ
     # _psi2_dmu NxQ
     # _psi2_dS NxQ
-
+
     variance2 = np.square(variance)
     common_sum = np.dot(mu,(variance*Z).T)
     Z_expect = (np.dot(dL_dpsi2,Z)*Z).sum(axis=0)
@@ -66,12 +69,12 @@ def _psi2computations(dL_dpsi2, variance, Z, mu, S):
     Z1_expect = np.dot(dL_dpsi2T,Z)
 
     dL_dvar = 2.*S.sum(axis=0)*variance*Z_expect+(common_expect*mu).sum(axis=0)
-
+
     dL_dmu = common_expect*variance
-
+
     dL_dS = np.empty(S.shape)
     dL_dS[:] = Z_expect*variance2
-
+
     dL_dZ = variance2*S.sum(axis=0)*Z1_expect+np.dot(Z2_expect.T,variance*mu)
 
     return dL_dvar, dL_dmu, dL_dS, dL_dZ
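The per-point psi2 path for the linear kernel is left as NotImplementedError above. A hypothetical per-point variant (my sketch only, not code from this patch) would keep N in both terms of the implemented expression; summing it over N recovers that expression, since the outer products of the rows of psi1 sum to tdot(psi1.T):

    import numpy as np

    N, M, Q = 40, 6, 3
    variance = np.random.rand(Q)
    Z = np.random.randn(M, Q)
    mu = np.random.randn(N, Q)
    S = np.random.rand(N, Q)

    psi1 = mu.dot((variance * Z).T)                                   # NxM
    psi2_n = (np.einsum('nq,mq,oq->nmo', S * np.square(variance), Z, Z)
              + psi1[:, :, None] * psi1[:, None, :])                  # NxMxM
    psi2 = np.dot(S.sum(axis=0) * np.square(variance) * Z, Z.T) + psi1.T.dot(psi1)
    assert np.allclose(psi2_n.sum(axis=0), psi2)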
diff --git a/GPy/kern/_src/psi_comp/rbf_psi_comp.py b/GPy/kern/_src/psi_comp/rbf_psi_comp.py
index 16bada72..de958a37 100644
--- a/GPy/kern/_src/psi_comp/rbf_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/rbf_psi_comp.py
@@ -5,7 +5,7 @@ The module for psi-statistics for RBF kernel
 import numpy as np
 from GPy.util.caching import Cacher
 
-def psicomputations(variance, lengthscale, Z, variational_posterior):
+def psicomputations(variance, lengthscale, Z, variational_posterior, return_psi2_n=False):
     """
     Z - MxQ
     mu - NxQ
@@ -21,7 +21,9 @@ def psicomputations(variance, lengthscale, Z, variational_posterior):
     psi0 = np.empty(mu.shape[0])
     psi0[:] = variance
     psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
-    psi2 = _psi2computations(variance, lengthscale, Z, mu, S).sum(axis=0)
+    psi2 = _psi2computations(variance, lengthscale, Z, mu, S)
+    if not return_psi2_n:
+        psi2 = psi2.sum(axis=0)
     return psi0, psi1, psi2
 
 def __psi1computations(variance, lengthscale, Z, mu, S):
@@ -66,11 +68,12 @@ def __psi2computations(variance, lengthscale, Z, mu, S):
     _psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2)
     return _psi2
 
-def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
+def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior,
+                              psi0=None, psi1=None, psi2=None):
     ARD = (len(lengthscale)!=1)
 
-    dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
-    dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
+    dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, psi1=psi1)
+    dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, psi2=psi2)
 
     dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
@@ -84,7 +87,7 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscal
     return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS
 
-def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
+def __psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, psi1=None):
     """
     dL_dpsi1 - NxM
     Z - MxQ
@@ -103,8 +106,9 @@ def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
     lengthscale2 = np.square(lengthscale)
 
-    _psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
-    Lpsi1 = dL_dpsi1*_psi1
+    if psi1 is None:
+        psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
+    Lpsi1 = dL_dpsi1*psi1
     Zmu = Z[None,:,:]-mu[:,None,:] # NxMxQ
     denom = 1./(S+lengthscale2)
     Zmu2_denom = np.square(Zmu)*denom[:,None,:] #NxMxQ
@@ -116,7 +120,7 @@ def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
     return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
 
-def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
+def __psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, psi2=None):
     """
     Z - MxQ
     mu - NxQ
@@ -137,8 +141,9 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
     denom = 1./(2*S+lengthscale2)
     denom2 = np.square(denom)
 
-    _psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM
-    Lpsi2 = dL_dpsi2*_psi2 # dL_dpsi2 is MxM, using broadcast to multiply N out
+    if psi2 is None:
+        psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM
+    Lpsi2 = dL_dpsi2*psi2 # dL_dpsi2 is MxM, using broadcast to multiply N out
     Lpsi2sum = np.einsum('nmo->n',Lpsi2) #N
     Lpsi2Z = np.einsum('nmo,oq->nq',Lpsi2,Z) #NxQ
     Lpsi2Z2 = np.einsum('nmo,oq,oq->nq',Lpsi2,Z,Z) #NxQ
@@ -158,3 +163,5 @@
 
 _psi1computations = Cacher(__psi1computations, limit=5)
 _psi2computations = Cacher(__psi2computations, limit=5)
+_psi1compDer = Cacher(__psi1compDer, limit=5)
+_psi2compDer = Cacher(__psi2compDer, limit=5)
diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
index f6a24c86..10ea95e4 100644
--- a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
@@ -9,7 +9,7 @@ import numpy as np
 
 try:
     from scipy import weave
-
+
     def _psicomputations(variance, lengthscale, Z, variational_posterior):
         """
         Z - MxQ
@@ -23,7 +23,7 @@ try:
         mu = variational_posterior.mean
         S = variational_posterior.variance
         gamma = variational_posterior.binary_prob
-
+
         N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
         l2 = np.square(lengthscale)
         log_denom1 = np.log(S/l2+1)
@@ -35,13 +35,13 @@ try:
         psi0[:] = variance
         psi1 = np.empty((N,M))
         psi2n = np.empty((N,M,M))
-
+
         from ....util.misc import param_to_array
         S = param_to_array(S)
         mu = param_to_array(mu)
         gamma = param_to_array(gamma)
         Z = param_to_array(Z)
-
+
         support_code = """
         #include <math.h>
         """
@@ -56,11 +56,11 @@ try:
                     double lq = l2(q);
                     double Zm1q = Z(m1,q);
                     double Zm2q = Z(m2,q);
-
+
                     if(m2==0) {
                         // Compute Psi_1
                         double muZ = mu(n,q)-Z(m1,q);
-
+
                         double psi1_exp1 = log_gamma(n,q) - (muZ*muZ/(Snq+lq) +log_denom1(n,q))/2.;
                         double psi1_exp2 = log_gamma1(n,q) -Zm1q*Zm1q/(2.*lq);
                         log_psi1 += (psi1_exp1>psi1_exp2)?psi1_exp1+log1p(exp(psi1_exp2-psi1_exp1)):psi1_exp2+log1p(exp(psi1_exp1-psi1_exp2));
@@ -69,10 +69,10 @@ try:
                         double muZhat = mu(n,q) - (Zm1q+Zm2q)/2.;
                         double Z2 = Zm1q*Zm1q+ Zm2q*Zm2q;
                         double dZ = Zm1q - Zm2q;
-
+
                         double psi2_exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. + log_gamma(n,q);
                         double psi2_exp2 = log_gamma1(n,q) - Z2/(2.*lq);
-                        log_psi2_n += (psi2_exp1>psi2_exp2)?psi2_exp1+log1p(exp(psi2_exp2-psi2_exp1)):psi2_exp2+log1p(exp(psi2_exp1-psi2_exp2)); 
+                        log_psi2_n += (psi2_exp1>psi2_exp2)?psi2_exp1+log1p(exp(psi2_exp2-psi2_exp1)):psi2_exp2+log1p(exp(psi2_exp1-psi2_exp2));
                     }
                     double exp_psi2_n = exp(log_psi2_n);
                     psi2n(n,m1,m2) = variance*variance*exp_psi2_n;
@@ -83,18 +83,18 @@ try:
             }
         """
         weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz)
-
+
         psi2 = psi2n.sum(axis=0)
         return psi0,psi1,psi2,psi2n
-
+
    from GPy.util.caching import Cacher
    psicomputations = Cacher(_psicomputations, limit=1)
-
+
    def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
        ARD = (len(lengthscale)!=1)
-
+
        _,psi1,_,psi2n = psicomputations(variance, lengthscale, Z, variational_posterior)
-
+
        mu = variational_posterior.mean
        S = variational_posterior.variance
        gamma = variational_posterior.binary_prob
@@ -105,7 +105,7 @@ try:
        log_gamma = np.log(gamma)
        log_gamma1 = np.log(1.-gamma)
        variance = float(variance)
-
+
        dvar = np.zeros(1)
        dmu = np.zeros((N,Q))
        dS = np.zeros((N,Q))
@@ -113,13 +113,13 @@ try:
        dl = np.zeros(Q)
        dZ = np.zeros((M,Q))
        dvar += np.sum(dL_dpsi0)
-
+
        from ....util.misc import param_to_array
        S = param_to_array(S)
        mu = param_to_array(mu)
        gamma = param_to_array(gamma)
        Z = param_to_array(Z)
-
+
        support_code = """
        #include <math.h>
        """
@@ -136,16 +136,16 @@ try:
                    double Zm2q = Z(m2,q);
                    double gnq = gamma(n,q);
                    double mu_nq = mu(n,q);
-
+
                    if(m2==0) {
-                        // Compute Psi_1 
+                        // Compute Psi_1
                        double lpsi1 = psi1(n,m1)*dL_dpsi1(n,m1);
                        if(q==0) {dvar(0) += lpsi1/variance;}
-
+
                        double Zmu = Zm1q - mu_nq;
                        double denom = Snq+lq;
                        double Zmu2_denom = Zmu*Zmu/denom;
-
+
                        double exp1 = log_gamma(n,q)-(Zmu*Zmu/(Snq+lq)+log_denom1(n,q))/(2.);
                        double exp2 = log_gamma1(n,q)-Zm1q*Zm1q/(2.*lq);
                        double d_exp1,d_exp2;
@@ -157,7 +157,7 @@ try:
                            d_exp2 = 1.;
                        }
                        double exp_sum = d_exp1+d_exp2;
-
+
                        dmu(n,q) += lpsi1*Zmu*d_exp1/(denom*exp_sum);
                        dS(n,q) += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum)/2.;
                        dgamma(n,q) += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
@@ -167,13 +167,13 @@ try:
                        // Compute Psi_2
                        double lpsi2 = psi2n(n,m1,m2)*dL_dpsi2(m1,m2);
                        if(q==0) {dvar(0) += lpsi2*2/variance;}
-
+
                        double dZm1m2 = Zm1q - Zm2q;
                        double Z2 = Zm1q*Zm1q+Zm2q*Zm2q;
                        double muZhat = mu_nq - (Zm1q + Zm2q)/2.;
                        double denom = 2.*Snq+lq;
                        double muZhat2_denom = muZhat*muZhat/denom;
-
+
                        double exp1 = dZm1m2*dZm1m2/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. + log_gamma(n,q);
                        double exp2 = log_gamma1(n,q) - Z2/(2.*lq);
                        double d_exp1,d_exp2;
@@ -185,23 +185,23 @@ try:
                            d_exp2 = 1.;
                        }
                        double exp_sum = d_exp1+d_exp2;
-
+
                        dmu(n,q) += -2.*lpsi2*muZhat/denom*d_exp1/exp_sum;
                        dS(n,q) += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum;
                        dgamma(n,q) += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
                        dl(q) += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZm1m2*dZm1m2/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum;
-                        dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum; 
+                        dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
                    }
                }
            }
        }
        """
        weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz)
-
+
        dl *= 2.*lengthscale
        if not ARD:
            dl = dl.sum()
-
+
        return dvar, dl, dZ, dmu, dS, dgamma
 
 except:
@@ -219,13 +219,13 @@ except:
        mu = variational_posterior.mean
        S = variational_posterior.variance
        gamma = variational_posterior.binary_prob
-
+
        psi0 = np.empty(mu.shape[0])
        psi0[:] = variance
        psi1 = _psi1computations(variance, lengthscale, Z, mu, S, gamma)
        psi2 = _psi2computations(variance, lengthscale, Z, mu, S, gamma)
        return psi0, psi1, psi2
-
+
    def _psi1computations(variance, lengthscale, Z, mu, S, gamma):
        """
        Z - MxQ
@@ -236,9 +236,9 @@ except:
        # here are the "statistics" for psi1
        # Produced intermediate results:
        # _psi1 NxM
-
+
        lengthscale2 = np.square(lengthscale)
-
+
        # psi1
        _psi1_denom = S[:, None, :] / lengthscale2 + 1. # Nx1xQ
        _psi1_denom_sqrt = np.sqrt(_psi1_denom) #Nx1xQ
@@ -251,9 +251,9 @@ except:
        _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ
        _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM
        _psi1 = variance * np.exp(_psi1_exp_sum) # NxM
-
+
        return _psi1
-
+
    def _psi2computations(variance, lengthscale, Z, mu, S, gamma):
        """
        Z - MxQ
@@ -264,14 +264,14 @@ except:
        # here are the "statistics" for psi2
        # Produced intermediate results:
        # _psi2 MxM
-
+
        lengthscale2 = np.square(lengthscale)
-
+
        _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
        _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
        _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
        _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
-
+
        # psi2
        _psi2_denom = 2.*S[:, None, None, :] / lengthscale2 + 1. # Nx1x1xQ
        _psi2_denom_sqrt = np.sqrt(_psi2_denom)
@@ -284,28 +284,28 @@ except:
        _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
        _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
        _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
-
+
        return _psi2
-
+
    def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
        ARD = (len(lengthscale)!=1)
-
+
        dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1, dgamma_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
        dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2, dgamma_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
-
+
        dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
-
+
        dL_dlengscale = dl_psi1 + dl_psi2
        if not ARD:
            dL_dlengscale = dL_dlengscale.sum()
-
+
        dL_dgamma = dgamma_psi1 + dgamma_psi2
        dL_dmu = dmu_psi1 + dmu_psi2
        dL_dS = dS_psi1 + dS_psi2
        dL_dZ = dZ_psi1 + dZ_psi2
-
+
        return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma
-
+
    def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma):
        """
        dL_dpsi1 - NxM
@@ -322,9 +322,9 @@ except:
        # _dL_dgamma NxQ
        # _dL_dmu NxQ
        # _dL_dS NxQ
-
+
        lengthscale2 = np.square(lengthscale)
-
+
        # psi1
        _psi1_denom = S / lengthscale2 + 1. # NxQ
        _psi1_denom_sqrt = np.sqrt(_psi1_denom) #NxQ
@@ -346,9 +346,9 @@ except:
        _dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2. # NxQ
        _dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z))
        _dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z))
-
-        return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma 
-
+
+        return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
+
    def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma):
        """
        Z - MxQ
@@ -365,14 +365,14 @@ except:
        # _dL_dgamma NxQ
        # _dL_dmu NxQ
        # _dL_dS NxQ
-
+
        lengthscale2 = np.square(lengthscale)
-
+
        _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
        _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
        _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
        _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
-
+
        # psi2
        _psi2_denom = 2.*S / lengthscale2 + 1. # NxQ
        _psi2_denom_sqrt = np.sqrt(_psi2_denom)
@@ -384,7 +384,7 @@ except:
        _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2)
        _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
        _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
-        _psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ 
+        _psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ
        _psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ
        _psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ
        _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
@@ -394,5 +394,5 @@ except:
        _dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq)
        _dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z))
        _dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z))
-
+
        return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py
index 73f2d0a4..357c48ec 100644
--- a/GPy/kern/_src/rbf.py
+++ b/GPy/kern/_src/rbf.py
@@ -56,16 +56,19 @@ class RBF(Stationary):
         return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[1]
 
     def psi2(self, Z, variational_posterior):
-        return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[2]
+        return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior, return_psi2_n=self.return_psi2_n)[2]
 
-    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[:2]
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
+                                      psi0=None, psi1=None, psi2=None):
+        dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2)[:2]
         self.variance.gradient = dL_dvar
         self.lengthscale.gradient = dL_dlengscale
 
-    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[2]
+    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
+                                 psi0=None, psi1=None, psi2=None):
+        return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2)[2]
 
-    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[3:]
+    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
+                                  psi0=None, psi1=None, psi2=None):
+        return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2)[3:]
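A usage sketch of the flag the RBF wrapper now consults (this assumes the branch is installed, and that return_psi2_n is the property backed by _return_psi2_n_flag, which this PR wires up elsewhere):

    import numpy as np
    import GPy
    from GPy.core.parameterization.variational import NormalPosterior

    kern = GPy.kern.RBF(2)
    Z = np.random.randn(10, 2)                     # M=10 inducing inputs
    qX = NormalPosterior(np.random.randn(100, 2),  # N=100 variational means...
                         np.ones((100, 2)))        # ...and variances

    kern.return_psi2_n = False
    print(kern.psi2(Z, qX).shape)   # (10, 10): psi2 summed over N
    kern.return_psi2_n = True
    print(kern.psi2(Z, qX).shape)   # (100, 10, 10): one MxM slab per data point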
diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py
index 7a7bb0e8..47c425a4 100644
--- a/GPy/models/bayesian_gplvm_minibatch.py
+++ b/GPy/models/bayesian_gplvm_minibatch.py
@@ -80,45 +80,9 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
         """Get the gradients of the posterior distribution of X in its specific form."""
         return X.mean.gradient, X.variance.gradient
 
-    def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, **kw):
-        posterior, log_marginal_likelihood, grad_dict, current_values, value_indices = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, subset_indices=subset_indices, **kw)
-
-        if self.has_uncertain_inputs():
-            current_values['meangrad'], current_values['vargrad'] = self.kern.gradients_qX_expectations(
-                                            variational_posterior=X,
-                                            Z=Z, dL_dpsi0=grad_dict['dL_dpsi0'],
-                                            dL_dpsi1=grad_dict['dL_dpsi1'],
-                                            dL_dpsi2=grad_dict['dL_dpsi2'])
-        else:
-            current_values['Xgrad'] = self.kern.gradients_X(grad_dict['dL_dKnm'], X, Z)
-            current_values['Xgrad'] += self.kern.gradients_X_diag(grad_dict['dL_dKdiag'], X)
-            if subset_indices is not None:
-                value_indices['Xgrad'] = subset_indices['samples']
-
-        kl_fctr = self.kl_factr
-        if self.has_uncertain_inputs():
-            if self.missing_data:
-                d = self.output_dim
-                log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)/d
-            else:
-                log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)
-
-            # Subsetting Variational Posterior objects, makes the gradients
-            # empty. We need them to be 0 though:
-            X.mean.gradient[:] = 0
-            X.variance.gradient[:] = 0
-
-            self.variational_prior.update_gradients_KL(X)
-            if self.missing_data:
-                current_values['meangrad'] += kl_fctr*X.mean.gradient/d
-                current_values['vargrad'] += kl_fctr*X.variance.gradient/d
-            else:
-                current_values['meangrad'] += kl_fctr*X.mean.gradient
-                current_values['vargrad'] += kl_fctr*X.variance.gradient
-
-            if subset_indices is not None:
-                value_indices['meangrad'] = subset_indices['samples']
-                value_indices['vargrad'] = subset_indices['samples']
+    def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, psi0=None, psi1=None, psi2=None, **kw):
+        posterior, log_marginal_likelihood, grad_dict, current_values, value_indices = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm,
+                                                             psi0=psi0, psi1=psi1, psi2=psi2, subset_indices=subset_indices, **kw)
         return posterior, log_marginal_likelihood, grad_dict, current_values, value_indices
 
     def _outer_values_update(self, full_values):
@@ -127,6 +91,47 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
         E.g. set the gradients of parameters, etc.
""" super(BayesianGPLVMMiniBatch, self)._outer_values_update(full_values) + current_values = full_values + grad_dict = current_values + if self.has_uncertain_inputs(): + current_values['meangrad'], current_values['vargrad'] = self.kern.gradients_qX_expectations( + variational_posterior=self.X, + Z=self.Z, dL_dpsi0=grad_dict['dL_dpsi0'], + dL_dpsi1=grad_dict['dL_dpsi1'], + dL_dpsi2=grad_dict['dL_dpsi2'], + psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) + else: + current_values['Xgrad'] = self.kern.gradients_X(grad_dict['dL_dKnm'], self.X, self.Z) + current_values['Xgrad'] += self.kern.gradients_X_diag(grad_dict['dL_dKdiag'], self.X) + if subset_indices is not None: + value_indices['Xgrad'] = subset_indices['samples'] + + kl_fctr = self.kl_factr + if self.has_uncertain_inputs(): + #if self.missing_data: + #d = self.output_dim + #log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)/d + #else: + self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X) + + # Subsetting Variational Posterior objects, makes the gradients + # empty. We need them to be 0 though: + self.X.mean.gradient[:] = 0 + self.X.variance.gradient[:] = 0 + + self.variational_prior.update_gradients_KL(self.X) + #if self.missing_data: + #current_values['meangrad'] += kl_fctr*self.X.mean.gradient/d + #current_values['vargrad'] += kl_fctr*self.X.variance.gradient/d + #else: + current_values['meangrad'] += kl_fctr*self.X.mean.gradient + current_values['vargrad'] += kl_fctr*self.X.variance.gradient + + #if subset_indices is not None: + #value_indices['meangrad'] = subset_indices['samples'] + #value_indices['vargrad'] = subset_indices['samples'] + + full_values = current_values if self.has_uncertain_inputs(): self.X.mean.gradient = full_values['meangrad'] self.X.variance.gradient = full_values['vargrad'] @@ -134,11 +139,13 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): self.X.gradient = full_values['Xgrad'] def _outer_init_full_values(self): - if self.has_uncertain_inputs(): - return dict(meangrad=np.zeros(self.X.mean.shape), - vargrad=np.zeros(self.X.variance.shape)) - else: - return dict(Xgrad=np.zeros(self.X.shape)) + full_values = super(BayesianGPLVMMiniBatch, self)._outer_init_full_values() + #if self.has_uncertain_inputs(): + #return dict(meangrad=np.zeros(self.X.mean.shape), + #vargrad=np.zeros(self.X.variance.shape)) + #else: + #return dict(Xgrad=np.zeros(self.X.shape)) + return full_values def parameters_changed(self): super(BayesianGPLVMMiniBatch,self).parameters_changed() diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py index 07295255..f8322b4e 100644 --- a/GPy/models/sparse_gp_minibatch.py +++ b/GPy/models/sparse_gp_minibatch.py @@ -44,7 +44,7 @@ class SparseGPMiniBatch(SparseGP): def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, name='sparse gp', Y_metadata=None, normalizer=False, missing_data=False, stochastic=False, batchsize=1): - + # pick a sensible inference method if inference_method is None: if isinstance(likelihood, likelihoods.Gaussian): @@ -80,7 +80,7 @@ class SparseGPMiniBatch(SparseGP): def has_uncertain_inputs(self): return isinstance(self.X, VariationalPosterior) - def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, **kwargs): + def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, psi0=None, psi1=None, psi2=None, missing_inds=None, **kwargs): """ This is the 
diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py
index 07295255..f8322b4e 100644
--- a/GPy/models/sparse_gp_minibatch.py
+++ b/GPy/models/sparse_gp_minibatch.py
@@ -44,7 +44,7 @@ class SparseGPMiniBatch(SparseGP):
     def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
                  name='sparse gp', Y_metadata=None, normalizer=False,
                  missing_data=False, stochastic=False, batchsize=1):
-
+
         # pick a sensible inference method
         if inference_method is None:
             if isinstance(likelihood, likelihoods.Gaussian):
@@ -80,7 +80,7 @@ class SparseGPMiniBatch(SparseGP):
     def has_uncertain_inputs(self):
         return isinstance(self.X, VariationalPosterior)
 
-    def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, **kwargs):
+    def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, psi0=None, psi1=None, psi2=None, missing_inds=None, **kwargs):
         """
         This is the standard part, which usually belongs in parameters_changed.
@@ -99,46 +99,22 @@ class SparseGPMiniBatch(SparseGP):
         like them into this dictionary for inner use of the
         indices inside the algorithm.
         """
-        try:
-            posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=None, **kwargs)
-        except:
-            posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata)
+        if psi2 is None:
+            psi2_sum_n = None
+        else:
+            psi2_sum_n = psi2.sum(axis=0)
+        posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=None, psi0=psi0, psi1=psi1, psi2=psi2_sum_n, **kwargs)
         current_values = {}
         likelihood.update_gradients(grad_dict['dL_dthetaL'])
         current_values['likgrad'] = likelihood.gradient.copy()
         if subset_indices is None:
             subset_indices = {}
-        if isinstance(X, VariationalPosterior):
-            #gradients wrt kernel
-            dL_dKmm = grad_dict['dL_dKmm']
-            kern.update_gradients_full(dL_dKmm, Z, None)
-            current_values['kerngrad'] = kern.gradient.copy()
-            kern.update_gradients_expectations(variational_posterior=X,
-                                               Z=Z,
-                                               dL_dpsi0=grad_dict['dL_dpsi0'],
-                                               dL_dpsi1=grad_dict['dL_dpsi1'],
-                                               dL_dpsi2=grad_dict['dL_dpsi2'])
-            current_values['kerngrad'] += kern.gradient
+        current_values['dL_dpsi0'] = grad_dict['dL_dpsi0']
+        current_values['dL_dpsi1'] = grad_dict['dL_dpsi1']
+        current_values['dL_dpsi2'] = grad_dict['dL_dpsi2']
+        current_values['dL_dKmm'] = grad_dict['dL_dKmm']
 
-            #gradients wrt Z
-            current_values['Zgrad'] = kern.gradients_X(dL_dKmm, Z)
-            current_values['Zgrad'] += kern.gradients_Z_expectations(
-                               grad_dict['dL_dpsi0'],
-                               grad_dict['dL_dpsi1'],
-                               grad_dict['dL_dpsi2'],
-                               Z=Z,
-                               variational_posterior=X)
-        else:
-            #gradients wrt kernel
-            kern.update_gradients_diag(grad_dict['dL_dKdiag'], X)
-            current_values['kerngrad'] = kern.gradient.copy()
-            kern.update_gradients_full(grad_dict['dL_dKnm'], X, Z)
-            current_values['kerngrad'] += kern.gradient
-            kern.update_gradients_full(grad_dict['dL_dKmm'], Z, None)
-            current_values['kerngrad'] += kern.gradient
-            #gradients wrt Z
-            current_values['Zgrad'] = kern.gradients_X(grad_dict['dL_dKmm'], Z)
-            current_values['Zgrad'] += kern.gradients_X(grad_dict['dL_dKnm'].T, Z, X)
+        #current_values = grad_dict
         return posterior, log_marginal_likelihood, grad_dict, current_values, subset_indices
 
     def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None):
@@ -192,9 +168,43 @@ class SparseGPMiniBatch(SparseGP):
         Here you put the values, which were collected before in the right places.
         E.g. set the gradients of parameters, etc.
""" - self.likelihood.gradient = full_values['likgrad'] - self.kern.gradient = full_values['kerngrad'] - self.Z.gradient = full_values['Zgrad'] + grad_dict = full_values + current_values = full_values + #current_values = {} + if isinstance(self.X, VariationalPosterior): + #gradients wrt kernel + dL_dKmm = grad_dict['dL_dKmm'] + self.kern.update_gradients_full(dL_dKmm, self.Z, None) + current_values['kerngrad'] = self.kern.gradient.copy() + self.kern.update_gradients_expectations(variational_posterior=self.X, + Z=self.Z, + dL_dpsi0=grad_dict['dL_dpsi0'], + dL_dpsi1=grad_dict['dL_dpsi1'], + dL_dpsi2=grad_dict['dL_dpsi2']) + current_values['kerngrad'] += self.kern.gradient + + #gradients wrt Z + current_values['Zgrad'] = self.kern.gradients_X(dL_dKmm, self.Z) + current_values['Zgrad'] += self.kern.gradients_Z_expectations( + grad_dict['dL_dpsi0'], + grad_dict['dL_dpsi1'], + grad_dict['dL_dpsi2'], + Z=self.Z, + variational_posterior=self.X) + else: + #gradients wrt kernel + kern.update_gradients_diag(grad_dict['dL_dKdiag'], self.X) + current_values['kerngrad'] = self.kern.gradient.copy() + kern.update_gradients_full(grad_dict['dL_dKnm'], self.X, self.Z) + current_values['kerngrad'] += kern.gradient + kern.update_gradients_full(grad_dict['dL_dKmm'], self.Z, None) + current_values['kerngrad'] += kern.gradient + #gradients wrt Z + current_values['Zgrad'] = kern.gradients_X(grad_dict['dL_dKmm'], self.Z) + current_values['Zgrad'] += kern.gradients_X(grad_dict['dL_dKnm'].T, self.Z, self.X) + self.likelihood.gradient = current_values['likgrad'] + self.kern.gradient = current_values['kerngrad'] + self.Z.gradient = current_values['Zgrad'] def _outer_init_full_values(self): """ @@ -209,7 +219,8 @@ class SparseGPMiniBatch(SparseGP): to initialize the gradients for the mean and the variance in order to have the full gradient for indexing) """ - return {} + return {'dL_dpsi0': np.zeros(self.X.shape[0]), + 'dL_dpsi1': np.zeros((self.X.shape[0], self.Z.shape[0]))} def _outer_loop_for_missing_data(self): Lm = None @@ -230,20 +241,35 @@ class SparseGPMiniBatch(SparseGP): message = m_f(-1) print(message, end=' ') - for d, ninan in self.stochastics.d: + #Compute the psi statistics for N once, but don't sum out N in psi2 + self.kern.return_psi2_n = True + psi0 = self.kern.psi0(self.Z, self.X) + psi1 = self.kern.psi1(self.Z, self.X) + psi2 = self.kern.psi2(self.Z, self.X) + self.psi0 = psi0 + self.psi1 = psi1 + self.psi2 = psi2 + for d, ninan in self.stochastics.d: if not self.stochastics: print(' '*(len(message)) + '\r', end=' ') message = m_f(d) print(message, end=' ') + psi0ni = psi0[ninan] + psi1ni = psi1[ninan] + psi2ni = psi2[ninan] + posterior, log_marginal_likelihood, \ grad_dict, current_values, value_indices = self._inner_parameters_changed( self.kern, self.X[ninan], self.Z, self.likelihood, self.Y_normalized[ninan][:, d], self.Y_metadata, Lm, dL_dKmm, - subset_indices=dict(outputs=d, samples=ninan)) + subset_indices=dict(outputs=d, samples=ninan, + dL_dpsi0=ninan, + dL_dpsi1=ninan), + psi0=psi0ni, psi1=psi1ni, psi2=psi2ni) self._inner_take_over_or_update(self.full_values, current_values, value_indices) self._inner_values_update(current_values) @@ -253,6 +279,7 @@ class SparseGPMiniBatch(SparseGP): woodbury_inv[:, :, d] = posterior.woodbury_inv[:,:,None] woodbury_vector[:, d] = posterior.woodbury_vector self._log_marginal_likelihood += log_marginal_likelihood + if not self.stochastics: print('')