Try calculating dL_dpsi1*psi1 individually for each dimension as we go along

This commit is contained in:
Alan Saul 2015-08-31 14:13:35 +03:00
parent c83f56723e
commit 3818aa3745
7 changed files with 47 additions and 31 deletions

View file

@ -117,7 +117,7 @@ class Kern(Parameterized):
raise NotImplementedError raise NotImplementedError
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
psi0=None, psi1=None, psi2=None): psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
""" """
Set the gradients of all parameters when doing inference with Set the gradients of all parameters when doing inference with
uncertain inputs, using expectations of the kernel. uncertain inputs, using expectations of the kernel.
@ -129,26 +129,26 @@ class Kern(Parameterized):
dL_dpsi2 * dpsi2_d{theta_i} dL_dpsi2 * dpsi2_d{theta_i}
""" """
dtheta = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, dtheta = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
psi0=psi0, psi1=psi1, psi2=psi2)[0] psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[0]
self.gradient[:] = dtheta self.gradient[:] = dtheta
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
psi0=None, psi1=None, psi2=None): psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
""" """
Returns the derivative of the objective wrt Z, using the chain rule Returns the derivative of the objective wrt Z, using the chain rule
through the expectation variables. through the expectation variables.
""" """
return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
psi0=psi0, psi1=psi1, psi2=psi2)[1] psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[1]
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
psi0=None, psi1=None, psi2=None): psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
""" """
Compute the gradients wrt the parameters of the variational Compute the gradients wrt the parameters of the variational
distruibution q(X), chain-ruling via the expectations of the kernel distruibution q(X), chain-ruling via the expectations of the kernel
""" """
return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
psi0=psi0, psi1=psi1, psi2=psi2)[2:] psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[2:]
def plot(self, x=None, fignum=None, ax=None, title=None, plot_limits=None, resolution=None, **mpl_kwargs): def plot(self, x=None, fignum=None, ax=None, title=None, plot_limits=None, resolution=None, **mpl_kwargs):
""" """

View file

@ -117,30 +117,30 @@ def _slice_psi(f):
def _slice_update_gradients_expectations(f): def _slice_update_gradients_expectations(f):
@wraps(f) @wraps(f)
def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
psi0=None, psi1=None, psi2=None): psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
with _Slice_wrap(self, Z, variational_posterior) as s: with _Slice_wrap(self, Z, variational_posterior) as s:
ret = f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2, ret = f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2,
psi0=psi0, psi1=psi1, psi2=psi2) psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)
return ret return ret
return wrap return wrap
def _slice_gradients_Z_expectations(f): def _slice_gradients_Z_expectations(f):
@wraps(f) @wraps(f)
def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
psi0=None, psi1=None, psi2=None): psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
with _Slice_wrap(self, Z, variational_posterior) as s: with _Slice_wrap(self, Z, variational_posterior) as s:
ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2, ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2,
psi0=psi0, psi1=psi1, psi2=psi2)) psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2))
return ret return ret
return wrap return wrap
def _slice_gradients_qX_expectations(f): def _slice_gradients_qX_expectations(f):
@wraps(f) @wraps(f)
def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
psi0=None, psi1=None, psi2=None): psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
with _Slice_wrap(self, variational_posterior, Z) as s: with _Slice_wrap(self, variational_posterior, Z) as s:
ret = list(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X2, s.X, ret = list(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X2, s.X,
psi0=psi0, psi1=psi1, psi2=psi2)) psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2))
r2 = ret[:2] r2 = ret[:2]
ret[0] = s.handle_return_array(r2[0]) ret[0] = s.handle_return_array(r2[0])
ret[1] = s.handle_return_array(r2[1]) ret[1] = s.handle_return_array(r2[1])

View file

@ -24,10 +24,10 @@ class PSICOMP_RBF(Pickleable):
@Cache_this(limit=10, ignore_args=(0,1,2,3)) @Cache_this(limit=10, ignore_args=(0,1,2,3))
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior, def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior,
psi0=None, psi1=None, psi2=None): psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
if isinstance(variational_posterior, variational.NormalPosterior): if isinstance(variational_posterior, variational.NormalPosterior):
return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior, return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior,
psi0=psi0, psi1=psi1, psi2=psi2) psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
return ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior) return ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
else: else:

View file

@ -69,11 +69,11 @@ def __psi2computations(variance, lengthscale, Z, mu, S):
return _psi2 return _psi2
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior, def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior,
psi0=None, psi1=None, psi2=None): psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
ARD = (len(lengthscale)!=1) ARD = (len(lengthscale)!=1)
dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, psi1=psi1) dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, psi1=psi1, Lpsi1=Lpsi1)
dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, psi2=psi2) dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, psi2=psi2, Lpsi2=Lpsi2)
dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2 dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
@ -87,7 +87,7 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscal
return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS
def __psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, psi1=None): def __psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, psi1=None, Lpsi1=None):
""" """
dL_dpsi1 - NxM dL_dpsi1 - NxM
Z - MxQ Z - MxQ
@ -108,6 +108,7 @@ def __psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, psi1=None):
if psi1 is None: if psi1 is None:
psi1 = _psi1computations(variance, lengthscale, Z, mu, S) psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
if Lpsi1 is None:
Lpsi1 = dL_dpsi1*psi1 Lpsi1 = dL_dpsi1*psi1
Zmu = Z[None,:,:]-mu[:,None,:] # NxMxQ Zmu = Z[None,:,:]-mu[:,None,:] # NxMxQ
denom = 1./(S+lengthscale2) denom = 1./(S+lengthscale2)
@ -120,7 +121,7 @@ def __psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, psi1=None):
return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
def __psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, psi2=None): def __psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, psi2=None, Lpsi2=None):
""" """
Z - MxQ Z - MxQ
mu - NxQ mu - NxQ
@ -143,6 +144,7 @@ def __psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, psi2=None):
if psi2 is None: if psi2 is None:
psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM
if Lpsi2 is None:
Lpsi2 = dL_dpsi2*psi2 # dL_dpsi2 is MxM, using broadcast to multiply N out Lpsi2 = dL_dpsi2*psi2 # dL_dpsi2 is MxM, using broadcast to multiply N out
Lpsi2sum = np.einsum('nmo->n',Lpsi2) #N Lpsi2sum = np.einsum('nmo->n',Lpsi2) #N
Lpsi2Z = np.einsum('nmo,oq->nq',Lpsi2,Z) #NxQ Lpsi2Z = np.einsum('nmo,oq->nq',Lpsi2,Z) #NxQ

View file

@ -59,16 +59,16 @@ class RBF(Stationary):
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior, return_psi2_n=self.return_psi2_n)[2] return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior, return_psi2_n=self.return_psi2_n)[2]
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
psi0=None, psi1=None, psi2=None): psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2)[:2] dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[:2]
self.variance.gradient = dL_dvar self.variance.gradient = dL_dvar
self.lengthscale.gradient = dL_dlengscale self.lengthscale.gradient = dL_dlengscale
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
psi0=None, psi1=None, psi2=None): psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2)[2] return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[2]
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
psi0=None, psi1=None, psi2=None): psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2)[3:] return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[3:]

View file

@ -126,7 +126,8 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'],
dL_dpsi1=full_values['dL_dpsi1'], dL_dpsi1=full_values['dL_dpsi1'],
dL_dpsi2=full_values['dL_dpsi2'], dL_dpsi2=full_values['dL_dpsi2'],
psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) psi0=self.psi0, psi1=self.psi1, psi2=self.psi2,
Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2'])
full_values['meangrad'] += meangrad_tmp full_values['meangrad'] += meangrad_tmp
full_values['vargrad'] += vargrad_tmp full_values['vargrad'] += vargrad_tmp
else: else:
@ -156,6 +157,11 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
full_values['vargrad'] = np.zeros((self.X.shape[0], self.X.shape[1])) full_values['vargrad'] = np.zeros((self.X.shape[0], self.X.shape[1]))
full_values['dL_dpsi0'] = np.zeros(self.X.shape[0]) full_values['dL_dpsi0'] = np.zeros(self.X.shape[0])
full_values['dL_dpsi1'] = np.zeros((self.X.shape[0], self.Z.shape[0])) full_values['dL_dpsi1'] = np.zeros((self.X.shape[0], self.Z.shape[0]))
full_values['dL_dpsi2'] = np.zeros((self.Z.shape[0], self.Z.shape[0]))
full_values['Lpsi0'] = np.zeros(self.X.shape[0])
full_values['Lpsi1'] = np.zeros((self.X.shape[0], self.Z.shape[0]))
full_values['Lpsi2'] = np.zeros((self.X.shape[0], self.Z.shape[0], self.Z.shape[0]))
return full_values return full_values
def parameters_changed(self): def parameters_changed(self):

View file

@ -106,6 +106,10 @@ class SparseGPMiniBatch(SparseGP):
posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm,
dL_dKmm=dL_dKmm, psi0=psi0, psi1=psi1, psi2=psi2_sum_n, **kwargs) dL_dKmm=dL_dKmm, psi0=psi0, psi1=psi1, psi2=psi2_sum_n, **kwargs)
if self.has_uncertain_inputs():
grad_dict['Lpsi0'] = grad_dict['dL_dpsi0']*psi0
grad_dict['Lpsi1'] = grad_dict['dL_dpsi1']*psi1
grad_dict['Lpsi2'] = grad_dict['dL_dpsi2']*psi2
return posterior, log_marginal_likelihood, grad_dict return posterior, log_marginal_likelihood, grad_dict
def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None): def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None):
@ -172,7 +176,8 @@ class SparseGPMiniBatch(SparseGP):
Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'],
dL_dpsi1=full_values['dL_dpsi1'], dL_dpsi1=full_values['dL_dpsi1'],
dL_dpsi2=full_values['dL_dpsi2'], dL_dpsi2=full_values['dL_dpsi2'],
psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) psi0=self.psi0, psi1=self.psi1, psi2=self.psi2,
Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2'])
#self.kern.update_gradients_expectations(variational_posterior=self.X, #self.kern.update_gradients_expectations(variational_posterior=self.X,
#Z=self.Z, #Z=self.Z,
#dL_dpsi0=full_values['dL_dpsi0'], #dL_dpsi0=full_values['dL_dpsi0'],
@ -187,7 +192,8 @@ class SparseGPMiniBatch(SparseGP):
Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'],
dL_dpsi1=full_values['dL_dpsi1'], dL_dpsi1=full_values['dL_dpsi1'],
dL_dpsi2=full_values['dL_dpsi2'], dL_dpsi2=full_values['dL_dpsi2'],
psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) psi0=self.psi0, psi1=self.psi1, psi2=self.psi2,
Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2'])
else: else:
#gradients wrt kernel #gradients wrt kernel
self.kern.update_gradients_diag(full_values['dL_dKdiag'], self.X) self.kern.update_gradients_diag(full_values['dL_dKdiag'], self.X)
@ -267,7 +273,9 @@ class SparseGPMiniBatch(SparseGP):
psi1ni = psi1[ninan] psi1ni = psi1[ninan]
if self.has_uncertain_inputs(): if self.has_uncertain_inputs():
psi2ni = psi2[ninan] psi2ni = psi2[ninan]
value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, meangrad=ninan, vargrad=ninan) #value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, meangrad=ninan, vargrad=ninan)
value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, meangrad=ninan, vargrad=ninan,
Lpsi0=ninan, Lpsi1=ninan, Lpsi2=ninan)
else: else:
psi2ni = None psi2ni = None
value_indices = dict(outputs=d, samples=ninan, dL_dKdiag=ninan, dL_dKnm=ninan) value_indices = dict(outputs=d, samples=ninan, dL_dKdiag=ninan, dL_dKnm=ninan)