From 29b9435a8e1fec3630a839515c73ae0ea731630f Mon Sep 17 00:00:00 2001 From: Siivola Eero Date: Fri, 7 Sep 2018 21:03:27 +0300 Subject: [PATCH] Simplified the way how multioutput kernel updates and uses gradients. As a consequense, DiffKern cannot be used as a standalone kernel anymore --- GPy/kern/src/diff_kern.py | 28 +++-- GPy/kern/src/kern.py | 12 +- GPy/kern/src/kernel_slice_operations.py | 123 +++++++++++++++++++- GPy/kern/src/multioutput_derivative_kern.py | 30 ++--- GPy/kern/src/multioutput_kern.py | 32 ++--- GPy/kern/src/stationary.py | 4 - 6 files changed, 179 insertions(+), 50 deletions(-) diff --git a/GPy/kern/src/diff_kern.py b/GPy/kern/src/diff_kern.py index b8019032..612c0632 100644 --- a/GPy/kern/src/diff_kern.py +++ b/GPy/kern/src/diff_kern.py @@ -1,21 +1,21 @@ # Copyright (c) 2018, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -from .kern import CombinationKernel +from .kern import Kern import numpy as np from paramz.caching import Cache_this -class DiffKern(CombinationKernel): +class DiffKern(Kern): """ Diff kernel is a thin wrapper for using partial derivatives of kernels as kernels. Eg. in combination with Multioutput kernel this allows the user to train GPs with observations of latent function and latent - function derivatives + function derivatives. NOTE: DiffKern only works when used with Multioutput kernel. Do not use the kernel as standalone The parameters the kernel needs are: -'base_kern': a member of Kernel class that is used for observations -'dimension': integer that indigates in which dimensions the partial derivative observations are """ def __init__(self, base_kern, dimension): - super(DiffKern, self).__init__([base_kern], 'DiffKern') + super(DiffKern, self).__init__(base_kern.active_dims.size, base_kern.active_dims, name='DiffKern') self.base_kern = base_kern self.dimension = dimension @@ -45,11 +45,13 @@ class DiffKern(CombinationKernel): def reset_gradients(self): self.base_kern.reset_gradients() - def get_gradient(self): + @property + def gradient(self): return self.base_kern.gradient - def append_gradient(self, gradient): - self.base_kern.gradient += gradient + @gradient.setter + def gradient(self, gradient): + self.base_kern.gradient = gradient def update_gradients_full(self, dL_dK, X, X2=None, dimX2=None): if dimX2 is None: @@ -65,11 +67,19 @@ class DiffKern(CombinationKernel): if X2 is None: X2 = X gradients = self.base_kern.dgradients_dX(X,X2, self.dimension) - self.base_kern.append_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients]) + self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients]) def update_gradients_dK_dX2(self, dL_dK, X, X2=None): gradients = self.base_kern.dgradients_dX2(X,X2, self.dimension) - self.base_kern.append_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients]) + self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients]) + + def gradients_X(self, dL_dK, X, X2): + tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:,:,:, self.dimension] + return np.sum(tmp, axis=1) + + def gradients_X2(self, dL_dK, X, X2): + tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:, :, self.dimension, :] + return np.sum(tmp, axis=1) def _convert_gradients(self, l,g, f = lambda x:x): if type(g) is np.ndarray: diff --git a/GPy/kern/src/kern.py b/GPy/kern/src/kern.py index 20061bc5..362ac418 100644 --- a/GPy/kern/src/kern.py +++ b/GPy/kern/src/kern.py @@ -116,6 +116,12 @@ class Kern(Parameterized): except: return X[:, self._all_dims_active] + def _project_dim(self, dim): + try: + return np.where(self._all_dims_active == dim)[0][0] + except: + return None + def K(self, X, X2): """ Compute the kernel function. @@ -200,12 +206,6 @@ class Kern(Parameterized): def reset_gradients(self): raise NotImplementedError - - def get_gradient(self): - return self.gradient - - def append_gradient(self,gradient): - self.gradient += gradient def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): """ diff --git a/GPy/kern/src/kernel_slice_operations.py b/GPy/kern/src/kernel_slice_operations.py index 327436e7..8a5dcb81 100644 --- a/GPy/kern/src/kernel_slice_operations.py +++ b/GPy/kern/src/kernel_slice_operations.py @@ -19,16 +19,26 @@ def put_clean(dct, name, func): class KernCallsViaSlicerMeta(ParametersChangedMeta): def __new__(cls, name, bases, dct): put_clean(dct, 'K', _slice_K) + put_clean(dct, 'dK_dX', _slice_dK_dX) + put_clean(dct, 'dK_dX2', _slice_dK_dX) + put_clean(dct, 'dK2_dXdX2', _slice_dK2_dXdX2) put_clean(dct, 'Kdiag', _slice_Kdiag) put_clean(dct, 'phi', _slice_Kdiag) put_clean(dct, 'update_gradients_full', _slice_update_gradients_full) put_clean(dct, 'update_gradients_diag', _slice_update_gradients_diag) + put_clean(dct, 'update_gradients_dK_dX', _slice_update_gradients_full) + put_clean(dct, 'update_gradients_dK_dX2', _slice_update_gradients_full) put_clean(dct, 'gradients_X', _slice_gradients_X) + put_clean(dct, 'gradients_X2', _slice_gradients_X) put_clean(dct, 'gradients_X_X2', _slice_gradients_X) put_clean(dct, 'gradients_XX', _slice_gradients_XX) put_clean(dct, 'gradients_XX_diag', _slice_gradients_XX_diag) put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag) + put_clean(dct, 'dgradients_dX',_slice_partial_gradients_list_X) + put_clean(dct, 'dgradients_dX2',_slice_partial_gradients_list_X) + put_clean(dct, 'dgradients2_dXdX2',_slice_partial_gradients_list_XX) + put_clean(dct, 'psi0', _slice_psi) put_clean(dct, 'psi1', _slice_psi) put_clean(dct, 'psi2', _slice_psi) @@ -79,6 +89,20 @@ class _Slice_wrap(object): return ret return return_val + def handle_return_list(self, return_val): + if self.ret: + ret = [np.zeros(self.shape) for _ in range(len(return_val))] + if len(self.shape) == 3: + for i in range(len(return_val)): + ret[i][self.k._all_dims_active, :, :] = return_val[i] + #[ret[i].__setitem__((:, :, self.k._all_dims_active), return_val[i]) for i in range(len(return_val))] + elif len(self.shape) == 4: + for i in range(len(return_val)): + ret[i][np.ix_(self.k._all_dims_active, self.k._all_dims_active)] = return_val[i] + #[ret[i].__setitem__((:, :, self.k._all_dims_active, self.k._all_dims_active), return_val[i]) for i in range(len(return_val))] + return ret + return return_val + def _slice_K(f): @wraps(f) def wrap(self, X, X2 = None, *a, **kw): @@ -97,15 +121,15 @@ def _slice_Kdiag(f): def _slice_update_gradients_full(f): @wraps(f) - def wrap(self, dL_dK, X, X2=None): + def wrap(self, dL_dK, X, X2=None, *a, **kw): with _Slice_wrap(self, X, X2) as s: - ret = f(self, dL_dK, s.X, s.X2) + ret = f(self, dL_dK, s.X, s.X2, *a, **kw) return ret return wrap def _slice_update_gradients_diag(f): @wraps(f) - def wrap(self, dL_dKdiag, X): + def wrap(self, dL_dKdiag, X, *a, **kw): with _Slice_wrap(self, X, None) as s: ret = f(self, dL_dKdiag, s.X) return ret @@ -119,6 +143,99 @@ def _slice_gradients_X(f): return ret return wrap +def _slice_dK_dX(f): + @wraps(f) + def wrap(self, X, X2, dim, *a, **kw): + with _Slice_wrap(self, X, X2) as s: + d = s.k._project_dim(dim) + if d is None: + ret = np.zeros((X.shape[0], X2.shape[0])) + else: + ret = f(self, s.X, s.X2, d, *a, **kw) + return ret + return wrap + +def _slice_dK2_dXdX2(f): + @wraps(f) + def wrap(self, X, X2, dimX, dimX2, *a, **kw): + with _Slice_wrap(self, X, X2) as s: + d = s.k._project_dim(dimX) + d2 = s.k._project_dim(dimX2) + if (d is None) or (d2 is None): + ret = np.zeros((X.shape[0], X2.shape[0])) + else: + ret = f(self, s.X, s.X2, d, d2, *a, **kw) + return ret + return wrap + +def _slice_partial_gradients_X(f): + @wraps(f) + def wrap(self, X, X2, dim): + if X2 is None: + N, M = X.shape[0], X.shape[0] + else: + N, M = X.shape[0], X2.shape[0] + Q1 = X.shape[1] + with _Slice_wrap(self, X, X2, ret_shape=(N, M, Q1)) as s: + ret = s.handle_return_array(f(self, s.X, s.X2, dim)) + return ret + return wrap + +def _slice_partial_gradients_list_X(f): + @wraps(f) + def wrap(self, X, X2, dim): + if X2 is None: + N, M = X.shape[0], X.shape[0] + else: + N, M = X.shape[0], X2.shape[0] + with _Slice_wrap(self, X, X2, ret_shape=(N, M)) as s: + d = s.k._project_dim(dim) + if d is None: + ret = [np.zeros((N, M)) for i in range(s.k.size)] + else: + ret = f(self, s.X, s.X2, d) + return ret + return wrap + +def _slice_partial_gradients_XX(f): + @wraps(f) + def wrap(self, X, X2, dim, dimX2): + if X2 is None: + N, M = X.shape[0], X.shape[0] + Q1 = X.shape[1] + else: + N, M = X.shape[0], X2.shape[0] + Q1, Q2 = X.shape[1], X2.shape[1] + with _Slice_wrap(self, X, X2, ret_shape=(N, M, Q1, Q2)) as s: + ret = s.handle_return_array(f(self, s.X, s.X2, dim, dimX2)) + return ret + return wrap + +def _slice_partial_gradients_list_XX(f): + @wraps(f) + def wrap(self, X, X2, dimX, dimX2): + if X2 is None: + N, M = X.shape[0], X.shape[0] + else: + N, M = X.shape[0], X2.shape[0] + with _Slice_wrap(self, X, X2, ret_shape=(N, M)) as s: + d = s.k._project_dim(dimX) + d2 = s.k._project_dim(dimX2) + if (d is None) or (d2 is None): + ret = [np.zeros((N, M)) for i in range(s.k.size)] + else: + ret = f(self, s.X, s.X2, d, d2) + return ret + return wrap + +def _slice_gradient_derivatives_X(f): + @wraps(f) + def wrap(self, dL_dK, X, X2=None): + with _Slice_wrap(self, X, X2) as s: + ret = s.handle_return_array(f(self, dL_dK, s.X, s.X2)) + return ret + return wrap + def _slice_gradients_X_diag(f): @wraps(f) def wrap(self, dL_dKdiag, X): diff --git a/GPy/kern/src/multioutput_derivative_kern.py b/GPy/kern/src/multioutput_derivative_kern.py index 4054947a..17da4a13 100644 --- a/GPy/kern/src/multioutput_derivative_kern.py +++ b/GPy/kern/src/multioutput_derivative_kern.py @@ -4,29 +4,32 @@ from .kern import Kern, CombinationKernel from .multioutput_kern import MultioutputKern, ZeroKern import numpy as np +from functools import partial class KernWrapper(Kern): def __init__(self, fk, fug, fg, base_kern): self.fk = fk self.fug = fug self.fg = fg - self.base_kern - super(KernWrapper, self).__init__(1, None, name='KernWrapper',useGPU=False) + self.base_kern = base_kern + super(KernWrapper, self).__init__(base_kern.active_dims.size, base_kern.active_dims, name='KernWrapper',useGPU=False) def K(self, X, X2=None): return self.fk(X,X2=X2) def update_gradients_full(self,dL_dK, X, X2=None): - return self.fug(dK_dK, X, X2=X2) + return self.fug(dL_dK, X, X2=X2) def gradients_X(self,dL_dK, X, X2=None): return self.fg(dL_dK, X, X2=X2) - def get_gradient(self): + @property + def gradient(self): return self.base_kern.gradient - def append_gradient(self, gradient): - self.base_kern.gradient += gradient + @gradient.setter + def gradient(self, gradient): + self.base_kern.gradient = gradient class MultioutputDerivativeKern(MultioutputKern): """ @@ -58,26 +61,23 @@ class MultioutputDerivativeKern(MultioutputKern): unique=True for j in range(0,nl): if i==j or (kernels[i] is kernels[j]): - covariance[i][j] = {'kern': kernels[i],'K':kernels[i].K, 'update_gradients_full': kernels[i].update_gradients_full, 'gradients_X': kernels[i].gradients_X} + kern = kernels[i] if i>j: unique=False elif cross_covariances.get((i,j)) is not None: #cross covariance is given - covariance[i][j] = cross_covariances.get((i,j)) - elif kernels[i].name == 'diffKern' and kernels[i].base_kern == kernels[j]: # one is derivative of other + kern = cross_covariances.get((i,j)) + elif kernels[i].name == 'DiffKern' and kernels[i].base_kern == kernels[j]: # one is derivative of other kern = KernWrapper(kernels[i].dK_dX_wrap,kernels[i].update_gradients_dK_dX,kernels[i].gradients_X, kernels[j]) - covariance[i][j] = {'kern':kern, 'K': kern.K, 'update_gradients_full': kern.update_gradients_full, 'gradients_X': kern.gradients_X} unique=False - elif kernels[j].name == 'diffKern' and kernels[j].base_kern == kernels[i]: # one is derivative of other + elif kernels[j].name == 'DiffKern' and kernels[j].base_kern == kernels[i]: # one is derivative of other kern = KernWrapper(kernels[j].dK_dX2_wrap,kernels[j].update_gradients_dK_dX2,kernels[j].gradients_X2, kernels[i]) - covariance[i][j] = {'kern':kern, 'K': kern.K, 'update_gradients_full': kern.update_gradients_full, 'gradients_X': kern.gradients_X} - elif kernels[i].name == 'diffKern' and kernels[j].name == 'diffKern' and kernels[i].base_kern == kernels[j].base_kern: #both are partial derivatives + elif kernels[i].name == 'DiffKern' and kernels[j].name == 'DiffKern' and kernels[i].base_kern == kernels[j].base_kern: #both are partial derivatives kern = KernWrapper(partial(kernels[i].K, dimX2=kernels[j].dimension), partial(kernels[i].update_gradients_full, dimX2=kernels[j].dimension),None, kernels[i].base_kern) - covariance[i][j] = {'kern':kern, 'K': kern.K, 'update_gradients_full': kern.update_gradients_full, 'gradients_X': kern.gradients_X} if i>j: unique=False else: kern = ZeroKern() - covariance[i][j] = {'kern': kern, 'K': kern.K, 'update_gradients_full': kern.update_gradients_full, 'gradients_X': kern.gradients_X} + covariance[i][j] = kern if unique is True: linked.append(i) self.covariance = covariance diff --git a/GPy/kern/src/multioutput_kern.py b/GPy/kern/src/multioutput_kern.py index a28d5bbb..6bedeb59 100644 --- a/GPy/kern/src/multioutput_kern.py +++ b/GPy/kern/src/multioutput_kern.py @@ -21,6 +21,13 @@ class ZeroKern(Kern): def gradients_X(self,dL_dK, X, X2=None): return np.zeros((X.shape[0],X.shape[1])) + @property + def gradient(self): + return np.empty((1,0)) + + @gradient.setter + def gradient(self, gradient): + pass class MultioutputKern(CombinationKernel): """ @@ -65,14 +72,13 @@ class MultioutputKern(CombinationKernel): unique=True for j in range(0,nl): if i==j or (kernels[i] is kernels[j]): - covariance[i][j] = {'kern': kernels[i], 'K': kernels[i].K, 'update_gradients_full': kernels[i].update_gradients_full, 'gradients_X': kernels[i].gradients_X} + covariance[i][j] = kernels[i] if i>j: unique=False elif cross_covariances.get((i,j)) is not None: #cross covariance is given covariance[i][j] = cross_covariances.get((i,j)) else: # zero covariance structure - kern = ZeroKern() - covariance[i][j] = {'kern': kern, 'K': kern.K, 'update_gradients_full': kern.update_gradients_full, 'gradients_X': kern.gradients_X} + covariance[i][j] = ZeroKern() if unique is True: linked.append(i) self.covariance = covariance @@ -85,7 +91,7 @@ class MultioutputKern(CombinationKernel): slices = index_to_slices(X[:,self.index_dim]) slices2 = index_to_slices(X2[:,self.index_dim]) target = np.zeros((X.shape[0], X2.shape[0])) - [[[[ target.__setitem__((slices[i][k],slices2[j][l]), self.covariance[i][j]['K'](X[slices[i][k],:],X2[slices2[j][l],:])) for k in range( len(slices[i]))] for l in range(len(slices2[j])) ] for i in range(len(slices))] for j in range(len(slices2))] + [[[[ target.__setitem__((slices[i][k],slices2[j][l]), self.covariance[i][j].K(X[slices[i][k],:],X2[slices2[j][l],:])) for k in range( len(slices[i]))] for l in range(len(slices2[j])) ] for i in range(len(slices))] for j in range(len(slices2))] return target @Cache_this(limit=3, ignore_args=()) @@ -96,15 +102,15 @@ class MultioutputKern(CombinationKernel): [[np.copyto(target[s], kern.Kdiag(X[s])) for s in slices_i] for kern, slices_i in zip(kerns, slices)] return target - def _update_gradients_full_wrapper(self, cov_struct, dL_dK, X, X2): - gradient = cov_struct['kern'].get_gradient().copy() - cov_struct['update_gradients_full'](dL_dK, X, X2) - cov_struct['kern'].append_gradient(gradient) + def _update_gradients_full_wrapper(self, kern, dL_dK, X, X2): + gradient = kern.gradient.copy() + kern.update_gradients_full(dL_dK, X, X2) + kern.gradient += gradient def _update_gradients_diag_wrapper(self, kern, dL_dKdiag, X): - gradient = kern.get_gradient().copy() + gradient = kern.gradient.copy() kern.update_gradients_diag(dL_dKdiag, X) - kern.append_gradient(gradient) + kern.gradient += gradient def reset_gradients(self): for kern in self.kern: kern.reset_gradients() @@ -121,14 +127,14 @@ class MultioutputKern(CombinationKernel): def update_gradients_diag(self, dL_dKdiag, X): self.reset_gradients() slices = index_to_slices(X[:,self.index_dim]) - [[ self._update_gradients_diag_wrapper(self.covariance[i][i]['kern'], dL_dKdiag[slices[i][k]], X[slices[i][k],:]) for k in range(len(slices[i]))] for i in range(len(slices))] + [[ self._update_gradients_diag_wrapper(self.covariance[i][i], dL_dKdiag[slices[i][k]], X[slices[i][k],:]) for k in range(len(slices[i]))] for i in range(len(slices))] def gradients_X(self,dL_dK, X, X2=None): slices = index_to_slices(X[:,self.index_dim]) target = np.zeros((X.shape[0], X.shape[1]) ) if X2 is not None: slices2 = index_to_slices(X2[:,self.index_dim]) - [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j]['gradients_X'](dL_dK[slices[i][k],slices2[j][l]], X[slices[i][k],:], X2[slices2[j][l],:]) ) for k in range(len(slices[i]))] for l in range(len(slices2[j]))] for i in range(len(slices))] for j in range(len(slices2))] + [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j].gradients_X(dL_dK[slices[i][k],slices2[j][l]], X[slices[i][k],:], X2[slices2[j][l],:]) ) for k in range(len(slices[i]))] for l in range(len(slices2[j]))] for i in range(len(slices))] for j in range(len(slices2))] else: - [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j]['gradients_X'](dL_dK[slices[i][k],slices[j][l]], X[slices[i][k],:], (None if (i==j and k==l) else X[slices[j][l],:] )) ) for k in range(len(slices[i]))] for l in range(len(slices[j]))] for i in range(len(slices))] for j in range(len(slices))] + [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j].gradients_X(dL_dK[slices[i][k],slices[j][l]], X[slices[i][k],:], (None if (i==j and k==l) else X[slices[j][l],:] )) ) for k in range(len(slices[i]))] for l in range(len(slices[j]))] for i in range(len(slices))] for j in range(len(slices))] return target \ No newline at end of file diff --git a/GPy/kern/src/stationary.py b/GPy/kern/src/stationary.py index f728e4ae..888320a0 100644 --- a/GPy/kern/src/stationary.py +++ b/GPy/kern/src/stationary.py @@ -221,10 +221,6 @@ class Stationary(Kern): """ self.variance.gradient = dL_dVar self.lengthscale.gradient = dL_dLen - - def append_gradients_direct(self, dL_dVar, dL_dLen): - self.variance.gradient += dL_dVar - self.lengthscale.gradient += dL_dLen def _inv_dist(self, X, X2=None): """