Simplified the way how multioutput kernel updates and uses gradients. As a consequense, DiffKern cannot be used as a standalone kernel anymore

This commit is contained in:
Siivola Eero 2018-09-07 21:03:27 +03:00
parent 03c2838bb9
commit 29b9435a8e
6 changed files with 179 additions and 50 deletions

View file

@ -1,21 +1,21 @@
# Copyright (c) 2018, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from .kern import CombinationKernel
from .kern import Kern
import numpy as np
from paramz.caching import Cache_this
class DiffKern(CombinationKernel):
class DiffKern(Kern):
"""
Diff kernel is a thin wrapper for using partial derivatives of kernels as kernels. Eg. in combination with
Multioutput kernel this allows the user to train GPs with observations of latent function and latent
function derivatives
function derivatives. NOTE: DiffKern only works when used with Multioutput kernel. Do not use the kernel as standalone
The parameters the kernel needs are:
-'base_kern': a member of Kernel class that is used for observations
-'dimension': integer that indigates in which dimensions the partial derivative observations are
"""
def __init__(self, base_kern, dimension):
super(DiffKern, self).__init__([base_kern], 'DiffKern')
super(DiffKern, self).__init__(base_kern.active_dims.size, base_kern.active_dims, name='DiffKern')
self.base_kern = base_kern
self.dimension = dimension
@ -45,11 +45,13 @@ class DiffKern(CombinationKernel):
def reset_gradients(self):
self.base_kern.reset_gradients()
def get_gradient(self):
@property
def gradient(self):
return self.base_kern.gradient
def append_gradient(self, gradient):
self.base_kern.gradient += gradient
@gradient.setter
def gradient(self, gradient):
self.base_kern.gradient = gradient
def update_gradients_full(self, dL_dK, X, X2=None, dimX2=None):
if dimX2 is None:
@ -65,11 +67,19 @@ class DiffKern(CombinationKernel):
if X2 is None:
X2 = X
gradients = self.base_kern.dgradients_dX(X,X2, self.dimension)
self.base_kern.append_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])
self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])
def update_gradients_dK_dX2(self, dL_dK, X, X2=None):
gradients = self.base_kern.dgradients_dX2(X,X2, self.dimension)
self.base_kern.append_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])
self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])
def gradients_X(self, dL_dK, X, X2):
tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:,:,:, self.dimension]
return np.sum(tmp, axis=1)
def gradients_X2(self, dL_dK, X, X2):
tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:, :, self.dimension, :]
return np.sum(tmp, axis=1)
def _convert_gradients(self, l,g, f = lambda x:x):
if type(g) is np.ndarray:

View file

@ -116,6 +116,12 @@ class Kern(Parameterized):
except:
return X[:, self._all_dims_active]
def _project_dim(self, dim):
try:
return np.where(self._all_dims_active == dim)[0][0]
except:
return None
def K(self, X, X2):
"""
Compute the kernel function.
@ -201,12 +207,6 @@ class Kern(Parameterized):
def reset_gradients(self):
raise NotImplementedError
def get_gradient(self):
return self.gradient
def append_gradient(self,gradient):
self.gradient += gradient
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
"""
Set the gradients of all parameters when doing inference with

View file

@ -19,16 +19,26 @@ def put_clean(dct, name, func):
class KernCallsViaSlicerMeta(ParametersChangedMeta):
def __new__(cls, name, bases, dct):
put_clean(dct, 'K', _slice_K)
put_clean(dct, 'dK_dX', _slice_dK_dX)
put_clean(dct, 'dK_dX2', _slice_dK_dX)
put_clean(dct, 'dK2_dXdX2', _slice_dK2_dXdX2)
put_clean(dct, 'Kdiag', _slice_Kdiag)
put_clean(dct, 'phi', _slice_Kdiag)
put_clean(dct, 'update_gradients_full', _slice_update_gradients_full)
put_clean(dct, 'update_gradients_diag', _slice_update_gradients_diag)
put_clean(dct, 'update_gradients_dK_dX', _slice_update_gradients_full)
put_clean(dct, 'update_gradients_dK_dX2', _slice_update_gradients_full)
put_clean(dct, 'gradients_X', _slice_gradients_X)
put_clean(dct, 'gradients_X2', _slice_gradients_X)
put_clean(dct, 'gradients_X_X2', _slice_gradients_X)
put_clean(dct, 'gradients_XX', _slice_gradients_XX)
put_clean(dct, 'gradients_XX_diag', _slice_gradients_XX_diag)
put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag)
put_clean(dct, 'dgradients_dX',_slice_partial_gradients_list_X)
put_clean(dct, 'dgradients_dX2',_slice_partial_gradients_list_X)
put_clean(dct, 'dgradients2_dXdX2',_slice_partial_gradients_list_XX)
put_clean(dct, 'psi0', _slice_psi)
put_clean(dct, 'psi1', _slice_psi)
put_clean(dct, 'psi2', _slice_psi)
@ -79,6 +89,20 @@ class _Slice_wrap(object):
return ret
return return_val
def handle_return_list(self, return_val):
if self.ret:
ret = [np.zeros(self.shape) for _ in range(len(return_val))]
if len(self.shape) == 3:
for i in range(len(return_val)):
ret[i][self.k._all_dims_active, :, :] = return_val[i]
#[ret[i].__setitem__((:, :, self.k._all_dims_active), return_val[i]) for i in range(len(return_val))]
elif len(self.shape) == 4:
for i in range(len(return_val)):
ret[i][np.ix_(self.k._all_dims_active, self.k._all_dims_active)] = return_val[i]
#[ret[i].__setitem__((:, :, self.k._all_dims_active, self.k._all_dims_active), return_val[i]) for i in range(len(return_val))]
return ret
return return_val
def _slice_K(f):
@wraps(f)
def wrap(self, X, X2 = None, *a, **kw):
@ -97,15 +121,15 @@ def _slice_Kdiag(f):
def _slice_update_gradients_full(f):
@wraps(f)
def wrap(self, dL_dK, X, X2=None):
def wrap(self, dL_dK, X, X2=None, *a, **kw):
with _Slice_wrap(self, X, X2) as s:
ret = f(self, dL_dK, s.X, s.X2)
ret = f(self, dL_dK, s.X, s.X2, *a, **kw)
return ret
return wrap
def _slice_update_gradients_diag(f):
@wraps(f)
def wrap(self, dL_dKdiag, X):
def wrap(self, dL_dKdiag, X, *a, **kw):
with _Slice_wrap(self, X, None) as s:
ret = f(self, dL_dKdiag, s.X)
return ret
@ -119,6 +143,99 @@ def _slice_gradients_X(f):
return ret
return wrap
def _slice_dK_dX(f):
@wraps(f)
def wrap(self, X, X2, dim, *a, **kw):
with _Slice_wrap(self, X, X2) as s:
d = s.k._project_dim(dim)
if d is None:
ret = np.zeros((X.shape[0], X2.shape[0]))
else:
ret = f(self, s.X, s.X2, d, *a, **kw)
return ret
return wrap
def _slice_dK2_dXdX2(f):
@wraps(f)
def wrap(self, X, X2, dimX, dimX2, *a, **kw):
with _Slice_wrap(self, X, X2) as s:
d = s.k._project_dim(dimX)
d2 = s.k._project_dim(dimX2)
if (d is None) or (d2 is None):
ret = np.zeros((X.shape[0], X2.shape[0]))
else:
ret = f(self, s.X, s.X2, d, d2, *a, **kw)
return ret
return wrap
def _slice_partial_gradients_X(f):
@wraps(f)
def wrap(self, X, X2, dim):
if X2 is None:
N, M = X.shape[0], X.shape[0]
else:
N, M = X.shape[0], X2.shape[0]
Q1 = X.shape[1]
with _Slice_wrap(self, X, X2, ret_shape=(N, M, Q1)) as s:
ret = s.handle_return_array(f(self, s.X, s.X2, dim))
return ret
return wrap
def _slice_partial_gradients_list_X(f):
@wraps(f)
def wrap(self, X, X2, dim):
if X2 is None:
N, M = X.shape[0], X.shape[0]
else:
N, M = X.shape[0], X2.shape[0]
with _Slice_wrap(self, X, X2, ret_shape=(N, M)) as s:
d = s.k._project_dim(dim)
if d is None:
ret = [np.zeros((N, M)) for i in range(s.k.size)]
else:
ret = f(self, s.X, s.X2, d)
return ret
return wrap
def _slice_partial_gradients_XX(f):
@wraps(f)
def wrap(self, X, X2, dim, dimX2):
if X2 is None:
N, M = X.shape[0], X.shape[0]
Q1 = X.shape[1]
else:
N, M = X.shape[0], X2.shape[0]
Q1, Q2 = X.shape[1], X2.shape[1]
with _Slice_wrap(self, X, X2, ret_shape=(N, M, Q1, Q2)) as s:
ret = s.handle_return_array(f(self, s.X, s.X2, dim, dimX2))
return ret
return wrap
def _slice_partial_gradients_list_XX(f):
@wraps(f)
def wrap(self, X, X2, dimX, dimX2):
if X2 is None:
N, M = X.shape[0], X.shape[0]
else:
N, M = X.shape[0], X2.shape[0]
with _Slice_wrap(self, X, X2, ret_shape=(N, M)) as s:
d = s.k._project_dim(dimX)
d2 = s.k._project_dim(dimX2)
if (d is None) or (d2 is None):
ret = [np.zeros((N, M)) for i in range(s.k.size)]
else:
ret = f(self, s.X, s.X2, d, d2)
return ret
return wrap
def _slice_gradient_derivatives_X(f):
@wraps(f)
def wrap(self, dL_dK, X, X2=None):
with _Slice_wrap(self, X, X2) as s:
ret = s.handle_return_array(f(self, dL_dK, s.X, s.X2))
return ret
return wrap
def _slice_gradients_X_diag(f):
@wraps(f)
def wrap(self, dL_dKdiag, X):

View file

@ -4,29 +4,32 @@
from .kern import Kern, CombinationKernel
from .multioutput_kern import MultioutputKern, ZeroKern
import numpy as np
from functools import partial
class KernWrapper(Kern):
def __init__(self, fk, fug, fg, base_kern):
self.fk = fk
self.fug = fug
self.fg = fg
self.base_kern
super(KernWrapper, self).__init__(1, None, name='KernWrapper',useGPU=False)
self.base_kern = base_kern
super(KernWrapper, self).__init__(base_kern.active_dims.size, base_kern.active_dims, name='KernWrapper',useGPU=False)
def K(self, X, X2=None):
return self.fk(X,X2=X2)
def update_gradients_full(self,dL_dK, X, X2=None):
return self.fug(dK_dK, X, X2=X2)
return self.fug(dL_dK, X, X2=X2)
def gradients_X(self,dL_dK, X, X2=None):
return self.fg(dL_dK, X, X2=X2)
def get_gradient(self):
@property
def gradient(self):
return self.base_kern.gradient
def append_gradient(self, gradient):
self.base_kern.gradient += gradient
@gradient.setter
def gradient(self, gradient):
self.base_kern.gradient = gradient
class MultioutputDerivativeKern(MultioutputKern):
"""
@ -58,26 +61,23 @@ class MultioutputDerivativeKern(MultioutputKern):
unique=True
for j in range(0,nl):
if i==j or (kernels[i] is kernels[j]):
covariance[i][j] = {'kern': kernels[i],'K':kernels[i].K, 'update_gradients_full': kernels[i].update_gradients_full, 'gradients_X': kernels[i].gradients_X}
kern = kernels[i]
if i>j:
unique=False
elif cross_covariances.get((i,j)) is not None: #cross covariance is given
covariance[i][j] = cross_covariances.get((i,j))
elif kernels[i].name == 'diffKern' and kernels[i].base_kern == kernels[j]: # one is derivative of other
kern = cross_covariances.get((i,j))
elif kernels[i].name == 'DiffKern' and kernels[i].base_kern == kernels[j]: # one is derivative of other
kern = KernWrapper(kernels[i].dK_dX_wrap,kernels[i].update_gradients_dK_dX,kernels[i].gradients_X, kernels[j])
covariance[i][j] = {'kern':kern, 'K': kern.K, 'update_gradients_full': kern.update_gradients_full, 'gradients_X': kern.gradients_X}
unique=False
elif kernels[j].name == 'diffKern' and kernels[j].base_kern == kernels[i]: # one is derivative of other
elif kernels[j].name == 'DiffKern' and kernels[j].base_kern == kernels[i]: # one is derivative of other
kern = KernWrapper(kernels[j].dK_dX2_wrap,kernels[j].update_gradients_dK_dX2,kernels[j].gradients_X2, kernels[i])
covariance[i][j] = {'kern':kern, 'K': kern.K, 'update_gradients_full': kern.update_gradients_full, 'gradients_X': kern.gradients_X}
elif kernels[i].name == 'diffKern' and kernels[j].name == 'diffKern' and kernels[i].base_kern == kernels[j].base_kern: #both are partial derivatives
elif kernels[i].name == 'DiffKern' and kernels[j].name == 'DiffKern' and kernels[i].base_kern == kernels[j].base_kern: #both are partial derivatives
kern = KernWrapper(partial(kernels[i].K, dimX2=kernels[j].dimension), partial(kernels[i].update_gradients_full, dimX2=kernels[j].dimension),None, kernels[i].base_kern)
covariance[i][j] = {'kern':kern, 'K': kern.K, 'update_gradients_full': kern.update_gradients_full, 'gradients_X': kern.gradients_X}
if i>j:
unique=False
else:
kern = ZeroKern()
covariance[i][j] = {'kern': kern, 'K': kern.K, 'update_gradients_full': kern.update_gradients_full, 'gradients_X': kern.gradients_X}
covariance[i][j] = kern
if unique is True:
linked.append(i)
self.covariance = covariance

View file

@ -21,6 +21,13 @@ class ZeroKern(Kern):
def gradients_X(self,dL_dK, X, X2=None):
return np.zeros((X.shape[0],X.shape[1]))
@property
def gradient(self):
return np.empty((1,0))
@gradient.setter
def gradient(self, gradient):
pass
class MultioutputKern(CombinationKernel):
"""
@ -65,14 +72,13 @@ class MultioutputKern(CombinationKernel):
unique=True
for j in range(0,nl):
if i==j or (kernels[i] is kernels[j]):
covariance[i][j] = {'kern': kernels[i], 'K': kernels[i].K, 'update_gradients_full': kernels[i].update_gradients_full, 'gradients_X': kernels[i].gradients_X}
covariance[i][j] = kernels[i]
if i>j:
unique=False
elif cross_covariances.get((i,j)) is not None: #cross covariance is given
covariance[i][j] = cross_covariances.get((i,j))
else: # zero covariance structure
kern = ZeroKern()
covariance[i][j] = {'kern': kern, 'K': kern.K, 'update_gradients_full': kern.update_gradients_full, 'gradients_X': kern.gradients_X}
covariance[i][j] = ZeroKern()
if unique is True:
linked.append(i)
self.covariance = covariance
@ -85,7 +91,7 @@ class MultioutputKern(CombinationKernel):
slices = index_to_slices(X[:,self.index_dim])
slices2 = index_to_slices(X2[:,self.index_dim])
target = np.zeros((X.shape[0], X2.shape[0]))
[[[[ target.__setitem__((slices[i][k],slices2[j][l]), self.covariance[i][j]['K'](X[slices[i][k],:],X2[slices2[j][l],:])) for k in range( len(slices[i]))] for l in range(len(slices2[j])) ] for i in range(len(slices))] for j in range(len(slices2))]
[[[[ target.__setitem__((slices[i][k],slices2[j][l]), self.covariance[i][j].K(X[slices[i][k],:],X2[slices2[j][l],:])) for k in range( len(slices[i]))] for l in range(len(slices2[j])) ] for i in range(len(slices))] for j in range(len(slices2))]
return target
@Cache_this(limit=3, ignore_args=())
@ -96,15 +102,15 @@ class MultioutputKern(CombinationKernel):
[[np.copyto(target[s], kern.Kdiag(X[s])) for s in slices_i] for kern, slices_i in zip(kerns, slices)]
return target
def _update_gradients_full_wrapper(self, cov_struct, dL_dK, X, X2):
gradient = cov_struct['kern'].get_gradient().copy()
cov_struct['update_gradients_full'](dL_dK, X, X2)
cov_struct['kern'].append_gradient(gradient)
def _update_gradients_full_wrapper(self, kern, dL_dK, X, X2):
gradient = kern.gradient.copy()
kern.update_gradients_full(dL_dK, X, X2)
kern.gradient += gradient
def _update_gradients_diag_wrapper(self, kern, dL_dKdiag, X):
gradient = kern.get_gradient().copy()
gradient = kern.gradient.copy()
kern.update_gradients_diag(dL_dKdiag, X)
kern.append_gradient(gradient)
kern.gradient += gradient
def reset_gradients(self):
for kern in self.kern: kern.reset_gradients()
@ -121,14 +127,14 @@ class MultioutputKern(CombinationKernel):
def update_gradients_diag(self, dL_dKdiag, X):
self.reset_gradients()
slices = index_to_slices(X[:,self.index_dim])
[[ self._update_gradients_diag_wrapper(self.covariance[i][i]['kern'], dL_dKdiag[slices[i][k]], X[slices[i][k],:]) for k in range(len(slices[i]))] for i in range(len(slices))]
[[ self._update_gradients_diag_wrapper(self.covariance[i][i], dL_dKdiag[slices[i][k]], X[slices[i][k],:]) for k in range(len(slices[i]))] for i in range(len(slices))]
def gradients_X(self,dL_dK, X, X2=None):
slices = index_to_slices(X[:,self.index_dim])
target = np.zeros((X.shape[0], X.shape[1]) )
if X2 is not None:
slices2 = index_to_slices(X2[:,self.index_dim])
[[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j]['gradients_X'](dL_dK[slices[i][k],slices2[j][l]], X[slices[i][k],:], X2[slices2[j][l],:]) ) for k in range(len(slices[i]))] for l in range(len(slices2[j]))] for i in range(len(slices))] for j in range(len(slices2))]
[[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j].gradients_X(dL_dK[slices[i][k],slices2[j][l]], X[slices[i][k],:], X2[slices2[j][l],:]) ) for k in range(len(slices[i]))] for l in range(len(slices2[j]))] for i in range(len(slices))] for j in range(len(slices2))]
else:
[[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j]['gradients_X'](dL_dK[slices[i][k],slices[j][l]], X[slices[i][k],:], (None if (i==j and k==l) else X[slices[j][l],:] )) ) for k in range(len(slices[i]))] for l in range(len(slices[j]))] for i in range(len(slices))] for j in range(len(slices))]
[[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j].gradients_X(dL_dK[slices[i][k],slices[j][l]], X[slices[i][k],:], (None if (i==j and k==l) else X[slices[j][l],:] )) ) for k in range(len(slices[i]))] for l in range(len(slices[j]))] for i in range(len(slices))] for j in range(len(slices))]
return target

View file

@ -222,10 +222,6 @@ class Stationary(Kern):
self.variance.gradient = dL_dVar
self.lengthscale.gradient = dL_dLen
def append_gradients_direct(self, dL_dVar, dL_dLen):
self.variance.gradient += dL_dVar
self.lengthscale.gradient += dL_dLen
def _inv_dist(self, X, X2=None):
"""
Compute the elementwise inverse of the distance matrix, expecpt on the