diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index fb12030b..78488628 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -117,7 +117,7 @@ class Kern(Parameterized): raise NotImplementedError def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): """ Set the gradients of all parameters when doing inference with uncertain inputs, using expectations of the kernel. @@ -129,26 +129,26 @@ class Kern(Parameterized): dL_dpsi2 * dpsi2_d{theta_i} """ dtheta = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=psi0, psi1=psi1, psi2=psi2)[0] + psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[0] self.gradient[:] = dtheta def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): """ Returns the derivative of the objective wrt Z, using the chain rule through the expectation variables. """ return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=psi0, psi1=psi1, psi2=psi2)[1] + psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[1] def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): """ Compute the gradients wrt the parameters of the variational distruibution q(X), chain-ruling via the expectations of the kernel """ return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=psi0, psi1=psi1, psi2=psi2)[2:] + psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[2:] def plot(self, x=None, fignum=None, ax=None, title=None, plot_limits=None, resolution=None, **mpl_kwargs): """ diff --git a/GPy/kern/_src/kernel_slice_operations.py b/GPy/kern/_src/kernel_slice_operations.py index a2058ca8..49899512 100644 --- a/GPy/kern/_src/kernel_slice_operations.py +++ b/GPy/kern/_src/kernel_slice_operations.py @@ -117,30 +117,30 @@ def _slice_psi(f): def _slice_update_gradients_expectations(f): @wraps(f) def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): with _Slice_wrap(self, Z, variational_posterior) as s: ret = f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2, - psi0=psi0, psi1=psi1, psi2=psi2) + psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2) return ret return wrap def _slice_gradients_Z_expectations(f): @wraps(f) def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): with _Slice_wrap(self, Z, variational_posterior) as s: ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2, - psi0=psi0, psi1=psi1, psi2=psi2)) + psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)) return ret return wrap def _slice_gradients_qX_expectations(f): @wraps(f) def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): with _Slice_wrap(self, variational_posterior, Z) as s: ret = list(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X2, s.X, - psi0=psi0, psi1=psi1, psi2=psi2)) + psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)) r2 = ret[:2] ret[0] = s.handle_return_array(r2[0]) ret[1] = s.handle_return_array(r2[1]) diff --git a/GPy/kern/_src/psi_comp/__init__.py b/GPy/kern/_src/psi_comp/__init__.py index 6849ed81..088b514c 100644 --- a/GPy/kern/_src/psi_comp/__init__.py +++ b/GPy/kern/_src/psi_comp/__init__.py @@ -24,10 +24,10 @@ class PSICOMP_RBF(Pickleable): @Cache_this(limit=10, ignore_args=(0,1,2,3)) def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): if isinstance(variational_posterior, variational.NormalPosterior): return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior, - psi0=psi0, psi1=psi1, psi2=psi2) + psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2) elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): return ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior) else: diff --git a/GPy/kern/_src/psi_comp/rbf_psi_comp.py b/GPy/kern/_src/psi_comp/rbf_psi_comp.py index de958a37..eaf2b04d 100644 --- a/GPy/kern/_src/psi_comp/rbf_psi_comp.py +++ b/GPy/kern/_src/psi_comp/rbf_psi_comp.py @@ -69,11 +69,11 @@ def __psi2computations(variance, lengthscale, Z, mu, S): return _psi2 def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): ARD = (len(lengthscale)!=1) - dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, psi1=psi1) - dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, psi2=psi2) + dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, psi1=psi1, Lpsi1=Lpsi1) + dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, psi2=psi2, Lpsi2=Lpsi2) dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2 @@ -87,7 +87,7 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscal return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS -def __psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, psi1=None): +def __psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, psi1=None, Lpsi1=None): """ dL_dpsi1 - NxM Z - MxQ @@ -108,7 +108,8 @@ def __psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, psi1=None): if psi1 is None: psi1 = _psi1computations(variance, lengthscale, Z, mu, S) - Lpsi1 = dL_dpsi1*psi1 + if Lpsi1 is None: + Lpsi1 = dL_dpsi1*psi1 Zmu = Z[None,:,:]-mu[:,None,:] # NxMxQ denom = 1./(S+lengthscale2) Zmu2_denom = np.square(Zmu)*denom[:,None,:] #NxMxQ @@ -120,7 +121,7 @@ def __psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, psi1=None): return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS -def __psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, psi2=None): +def __psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, psi2=None, Lpsi2=None): """ Z - MxQ mu - NxQ @@ -143,7 +144,8 @@ def __psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, psi2=None): if psi2 is None: psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM - Lpsi2 = dL_dpsi2*psi2 # dL_dpsi2 is MxM, using broadcast to multiply N out + if Lpsi2 is None: + Lpsi2 = dL_dpsi2*psi2 # dL_dpsi2 is MxM, using broadcast to multiply N out Lpsi2sum = np.einsum('nmo->n',Lpsi2) #N Lpsi2Z = np.einsum('nmo,oq->nq',Lpsi2,Z) #NxQ Lpsi2Z2 = np.einsum('nmo,oq,oq->nq',Lpsi2,Z,Z) #NxQ diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 357c48ec..3ba782b1 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -59,16 +59,16 @@ class RBF(Stationary): return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior, return_psi2_n=self.return_psi2_n)[2] def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): - dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2)[:2] + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): + dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[:2] self.variance.gradient = dL_dvar self.lengthscale.gradient = dL_dlengscale def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): - return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2)[2] + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): + return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[2] def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): - return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2)[3:] + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): + return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[3:] diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py index 8b8c6feb..0828ed6b 100644 --- a/GPy/models/bayesian_gplvm_minibatch.py +++ b/GPy/models/bayesian_gplvm_minibatch.py @@ -126,7 +126,8 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], dL_dpsi1=full_values['dL_dpsi1'], dL_dpsi2=full_values['dL_dpsi2'], - psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) + psi0=self.psi0, psi1=self.psi1, psi2=self.psi2, + Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2']) full_values['meangrad'] += meangrad_tmp full_values['vargrad'] += vargrad_tmp else: @@ -156,6 +157,11 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): full_values['vargrad'] = np.zeros((self.X.shape[0], self.X.shape[1])) full_values['dL_dpsi0'] = np.zeros(self.X.shape[0]) full_values['dL_dpsi1'] = np.zeros((self.X.shape[0], self.Z.shape[0])) + full_values['dL_dpsi2'] = np.zeros((self.Z.shape[0], self.Z.shape[0])) + + full_values['Lpsi0'] = np.zeros(self.X.shape[0]) + full_values['Lpsi1'] = np.zeros((self.X.shape[0], self.Z.shape[0])) + full_values['Lpsi2'] = np.zeros((self.X.shape[0], self.Z.shape[0], self.Z.shape[0])) return full_values def parameters_changed(self): diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py index c9d13e6b..e13d8061 100644 --- a/GPy/models/sparse_gp_minibatch.py +++ b/GPy/models/sparse_gp_minibatch.py @@ -106,6 +106,10 @@ class SparseGPMiniBatch(SparseGP): posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, psi0=psi0, psi1=psi1, psi2=psi2_sum_n, **kwargs) + if self.has_uncertain_inputs(): + grad_dict['Lpsi0'] = grad_dict['dL_dpsi0']*psi0 + grad_dict['Lpsi1'] = grad_dict['dL_dpsi1']*psi1 + grad_dict['Lpsi2'] = grad_dict['dL_dpsi2']*psi2 return posterior, log_marginal_likelihood, grad_dict def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None): @@ -172,7 +176,8 @@ class SparseGPMiniBatch(SparseGP): Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], dL_dpsi1=full_values['dL_dpsi1'], dL_dpsi2=full_values['dL_dpsi2'], - psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) + psi0=self.psi0, psi1=self.psi1, psi2=self.psi2, + Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2']) #self.kern.update_gradients_expectations(variational_posterior=self.X, #Z=self.Z, #dL_dpsi0=full_values['dL_dpsi0'], @@ -187,7 +192,8 @@ class SparseGPMiniBatch(SparseGP): Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], dL_dpsi1=full_values['dL_dpsi1'], dL_dpsi2=full_values['dL_dpsi2'], - psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) + psi0=self.psi0, psi1=self.psi1, psi2=self.psi2, + Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2']) else: #gradients wrt kernel self.kern.update_gradients_diag(full_values['dL_dKdiag'], self.X) @@ -267,7 +273,9 @@ class SparseGPMiniBatch(SparseGP): psi1ni = psi1[ninan] if self.has_uncertain_inputs(): psi2ni = psi2[ninan] - value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, meangrad=ninan, vargrad=ninan) + #value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, meangrad=ninan, vargrad=ninan) + value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, meangrad=ninan, vargrad=ninan, + Lpsi0=ninan, Lpsi1=ninan, Lpsi2=ninan) else: psi2ni = None value_indices = dict(outputs=d, samples=ninan, dL_dKdiag=ninan, dL_dKnm=ninan)