From a3f458926b36d06c920a8ea21a49e65113a39baf Mon Sep 17 00:00:00 2001
From: Max Zwiessele
Date: Tue, 7 Jun 2016 09:24:38 +0100
Subject: [PATCH] [gradsxx] putting tests in, not complete yet!

---
 GPy/core/gp.py                          | 11 ++--
 GPy/kern/src/kernel_slice_operations.py | 23 +++-----
 GPy/kern/src/stationary.py              | 47 +++++++---------
 GPy/testing/kernel_tests.py             | 75 +++++++++++++------------
 4 files changed, 72 insertions(+), 84 deletions(-)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index dd9b8b4c..6b1c9cb4 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -355,7 +355,7 @@ class GP(Model):
        :param X: The points at which to get the predictive gradients.
        :type X: np.ndarray (Xnew x self.input_dim)
        :param kern: The kernel to compute the jacobian for.
-        :param boolean full_cov: whether to return the cross-covariance terms between
+        :param boolean full_cov: whether to return the cross-covariance terms between
        the N* Jacobian vectors

        :returns: dmu_dX, dv_dX
@@ -377,9 +377,10 @@ class GP(Model):
        if full_cov:
            dK2_dXdX = kern.gradients_XX(one, Xnew)
        else:
-            dK2_dXdX = np.zeros((Xnew.shape[0], Xnew.shape[1], Xnew.shape[1]))
-            for i in range(Xnew.shape[0]):
-                dK2_dXdX[i:i+1,:,:] = kern.gradients_XX(one, Xnew[i:i+1,:])
+            dK2_dXdX = -kern.gradients_XX(one, Xnew).sum(0)
+            #dK2_dXdX = np.zeros((Xnew.shape[0], Xnew.shape[1], Xnew.shape[1]))
+            #for i in range(Xnew.shape[0]):
+            #    dK2_dXdX[i:i+1,:,:] = kern.gradients_XX(one, Xnew[i:i+1,:])

        def compute_cov_inner(wi):
            if full_cov:
@@ -424,7 +425,7 @@ class GP(Model):
            Sigma = var_jac.sum(-1)
        else:
            Sigma = self.output_dim*var_jac
-        
+
        G = 0.
        if mean:
            G += mumuT
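The vectorized line above leans on the identity documented in stationary.py further down (d2K/dX2^2 = -d2K/dXdX2): summing the pairwise Hessian blocks over the first axis, with the sign flipped, yields one [QxQ] block per test point. A minimal shape-level sketch of that contract, assuming an RBF kernel and the [NxNxQxQ] return convention this patch introduces:

    import numpy as np
    import GPy

    N, Q = 6, 2
    Xnew = np.random.randn(N, Q)
    kern = GPy.kern.RBF(Q)
    one = np.ones((N, N))  # stands in for the dL_dK weights used above

    # pairwise Hessian blocks [N x N x Q x Q], collapsed to one [Q x Q] block per point
    dK2_dXdX = -kern.gradients_XX(one, Xnew).sum(0)
    assert dK2_dXdX.shape == (N, Q, Q)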
diff --git a/GPy/kern/src/kernel_slice_operations.py b/GPy/kern/src/kernel_slice_operations.py
index 73160ef7..c4a64518 100644
--- a/GPy/kern/src/kernel_slice_operations.py
+++ b/GPy/kern/src/kernel_slice_operations.py
@@ -128,34 +128,25 @@ def _slice_gradients_X_diag(f):

def _slice_gradients_XX(f):
    @wraps(f)
-    def wrap(self, dL_dK, X, X2=None, cov=True):
+    def wrap(self, dL_dK, X, X2=None):
        if X2 is None:
            N, M = X.shape[0], X.shape[0]
            Q1 = Q2 = X.shape[1]
        else:
            N, M = X.shape[0], X2.shape[0]
            Q1, Q2 = X.shape[1], X2.shape[1]
-        if cov: # full covariance
-            #with _Slice_wrap(self, X, X2, ret_shape=None) as s:
-            with _Slice_wrap(self, X, X2, ret_shape=(N, M, Q1, Q2)) as s:
-                ret = s.handle_return_array(f(self, dL_dK, s.X, s.X2, cov=cov))
-        else: # diagonal covariance
-            #with _Slice_wrap(self, X, X2, ret_shape=None) as s:
-            with _Slice_wrap(self, X, X2, ret_shape=(N, M, Q1)) as s:
-                ret = s.handle_return_array(f(self, dL_dK, s.X, s.X2, cov=cov))
+        #with _Slice_wrap(self, X, X2, ret_shape=None) as s:
+        with _Slice_wrap(self, X, X2, ret_shape=(N, M, Q1, Q2)) as s:
+            ret = s.handle_return_array(f(self, dL_dK, s.X, s.X2))
        return ret
    return wrap

def _slice_gradients_XX_diag(f):
    @wraps(f)
-    def wrap(self, dL_dKdiag, X, cov=True):
+    def wrap(self, dL_dKdiag, X):
        N, Q = X.shape
-        if cov: # full covariance
-            with _Slice_wrap(self, X, None, diag=True, ret_shape=(N, Q, Q)) as s:
-                ret = s.handle_return_array(f(self, dL_dKdiag, s.X, cov=cov))
-        else: # diagonal covariance
-            with _Slice_wrap(self, X, None, ret_shape=(N, Q)) as s:
-                ret = s.handle_return_array(f(self, dL_dKdiag, s.X, cov=cov))
+        with _Slice_wrap(self, X, None, diag=True, ret_shape=(N, Q, Q)) as s:
+            ret = s.handle_return_array(f(self, dL_dKdiag, s.X))
        return ret
    return wrap
diff --git a/GPy/kern/src/stationary.py b/GPy/kern/src/stationary.py
index e5093d24..22613fa2 100644
--- a/GPy/kern/src/stationary.py
+++ b/GPy/kern/src/stationary.py
@@ -218,13 +218,13 @@ class Stationary(Kern):
        else:
            return self._gradients_X_pure(dL_dK, X, X2)

-    def gradients_XX(self, dL_dK, X, X2=None, cov=True):
+    def gradients_XX(self, dL_dK, X, X2=None):
        """
        Given the derivative of the objective K(dL_dK), compute the second derivative of K wrt X and X2:

-        cov = True: returns the full covariance matrix [QxQ] of the input dimensionfor each pair or vectors
-        cov = False: returns the diagonal of the covariance matrix [QxQ] of the input dimensionfor each pair
-        or vectors (computationally more efficient if the full covariance matrix is not needed)
+        returns the full covariance matrix [QxQ] of the input dimension for each pair of vectors, thus
+        the returned array is of shape [NxNxQxQ].
+

        ..math:
            \frac{\partial^2 K}{\partial X2 ^2} = - \frac{\partial^2 K}{\partial X\partial X2}
@@ -242,45 +242,36 @@ class Stationary(Kern):
        dL_drdr = self.dK2_drdr_via_X(X, X2) #* dL_dK # we perform this product later
        tmp2 = dL_drdr*invdist2
        l2 = np.ones(X.shape[1])*self.lengthscale**2 #np.multiply(np.ones(X.shape[1]) ,self.lengthscale**2)
-        
+
        if X2 is None:
            X2 = X
            tmp1 -= np.eye(X.shape[0])*self.variance
        else:
            tmp1[invdist2==0.] -= self.variance
-        if cov: # full covariance
-            grad = np.empty((X.shape[0], X2.shape[0], X2.shape[1], X.shape[1]), dtype=np.float64)
-            dist = X[:,None,:] - X2[None,:,:]
-            dist = (dist[:,:,:,None]*dist[:,:,None,:])
-            I = np.ones((X.shape[0], X2.shape[0], X2.shape[1], X.shape[1]))*np.eye((X2.shape[1]))
-            grad = (np.einsum('kl,klij->klij',dL_dK*(tmp1*invdist2 - tmp2), dist) /l2[None,None,:,None] - np.einsum('kl,klij->klij',dL_dK*tmp1, I))/l2[None,None,None,:]
-        else: # Diagonal covariance, old code
-            grad = np.empty((X.shape[0], X2.shape[0], X.shape[1]), dtype=np.float64)
-            #grad = np.empty(X.shape, dtype=np.float64)
-            for q in range(self.input_dim):
-                tmpdist2 = (X[:,[q]]-X2[:,[q]].T) ** 2
-                grad[:, :, q] = np.multiply(dL_dK,(np.multiply((tmp1*invdist2 - tmp2),tmpdist2)/l2[q] - tmp1)/l2[q])
-                #grad[:, :, q] = ((tmp1*invdist2 - tmp2)*tmpdist2/l2[q] - tmp1)/l2[q]
-                #grad[:, :, q] = ((tmp1*(((tmpdist2)*invdist2/l2[q])-1)) - (tmp2*(tmpdist2))/l2[q])/l2[q]
-                #np.sum(((tmp1*(((tmpdist2)*invdist2/l2[q])-1)) - (tmp2*(tmpdist2))/l2[q])/l2[q], axis=1, out=grad[:,q])
-                #np.sum( - (tmp2*(tmpdist**2)), axis=1, out=grad[:,q])
+        #grad = np.empty((X.shape[0], X2.shape[0], X2.shape[1], X.shape[1]), dtype=np.float64)
+        dist = X[:,None,:] - X2[None,:,:]
+        dist = (dist[:,:,:,None]*dist[:,:,None,:])
+        I = np.ones((X.shape[0], X2.shape[0], X2.shape[1], X.shape[1]))*np.eye((X2.shape[1]))
+        grad = (np.einsum('kl,klij->klij',dL_dK*(tmp1*invdist2 - tmp2), dist) /l2[None,None,:,None] - np.einsum('kl,klij->klij',dL_dK*tmp1, I))/l2[None,None,None,:]
        return grad

-    def gradients_XX_diag(self, d2L_dK, X, cov=True):
+    def gradients_XX_diag(self, dL_dK_diag, X):
        """
-        Given the derivative of the objective d2L_dK, compute the second derivative of K wrt X:
+        Given the derivative of the objective dL_dK, compute the second derivative of K wrt X:

        ..math:
            \frac{\partial^2 K}{\partial X\partial X}

        ..returns:
-            dL2_dXdX: [NxQ], for X [NxQ] if cov is False, [NxQxQ] if cov is True
+            dL2_dXdX: [NxQxQ]
        """
-        if cov:
-            tmp = np.ones(X.shape+(X.shape[1],))
-            return tmp * (d2L_dK * self.variance/self.lengthscale**2)[:,None,None]# np.zeros(X.shape+(X.shape[1],))
-        return np.ones(X.shape) * d2L_dK * self.variance/self.lengthscale**2 # np.zeros(X.shape)
+        dL_dK_diag = dL_dK_diag.reshape(-1, 1, 1)
+        assert dL_dK_diag.size == X.shape[0], "dL_dK_diag has to be given as row [N] or column vector [Nx1]"
+
+        l2 = np.ones(X.shape[1])*self.lengthscale**2
+        return (dL_dK_diag * self.variance/(l2[:,None]*l2[None,:]))# np.zeros(X.shape+(X.shape[1],))
+        #return np.ones(X.shape) * d2L_dK * self.variance/self.lengthscale**2 # np.zeros(X.shape)

    def _gradients_X_pure(self, dL_dK, X, X2=None):
        invdist = self._inv_dist(X, X2)
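With the cov flag gone, the second-derivative hooks have a single return convention: gradients_XX produces one [QxQ] Hessian block per pair of inputs, gradients_XX_diag one block per input. A small sketch of the expected shapes under this patch (RBF with ARD lengthscales chosen arbitrarily):

    import numpy as np
    import GPy

    N, M, Q = 5, 4, 3
    X, X2 = np.random.randn(N, Q), np.random.randn(M, Q)
    kern = GPy.kern.RBF(Q, ARD=True)

    # cross second derivative: one [Q x Q] block per (row of X, row of X2) pair
    H = kern.gradients_XX(np.ones((N, M)), X, X2)
    assert H.shape == (N, M, Q, Q)

    # diagonal second derivative: one [Q x Q] block per row of X
    Hd = kern.gradients_XX_diag(np.ones(N), X)
    assert Hd.shape == (N, Q, Q)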
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index b3019de0..9971e6d6 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -104,37 +104,42 @@ class Kern_check_dKdiag_dX(Kern_check_dK_dX):
    def parameters_changed(self):
        self.X.gradient[:] = self.kernel.gradients_X_diag(self.dL_dK.diagonal(), self.X)

-class Kern_check_d2K_dXdX_cov(Kern_check_model):
+class Kern_check_d2K_dXdX(Kern_check_model):
    """This class allows gradient checks for the second derivative of a kernel with respect to X.
    """
    def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
        Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2)
-        self.X = Param('X',X)
+        self.X = Param('X',X.copy())
        self.link_parameter(self.X)
+        self.Xc = X.copy()

    def log_likelihood(self):
+        if self.X2 is None:
+            return self.kernel.gradients_X(self.dL_dK, self.X, self.Xc).sum()
        return self.kernel.gradients_X(self.dL_dK, self.X, self.X2).sum()

    def parameters_changed(self):
        #if self.kernel.name == 'rbf':
        #    import ipdb;ipdb.set_trace()
-        if self.X2 is None: X2 = self.X
-        else: X2 = self.X2
-        grads = self.kernel.gradients_XX(self.dL_dK.T, X2, self.X, cov=True)
-        self.X.gradient[:] = grads.sum(-1).sum(0)
+        if self.X2 is None:
+            grads = -self.kernel.gradients_XX(self.dL_dK, self.X).sum(1).sum(1)
+        else:
+            grads = -self.kernel.gradients_XX(self.dL_dK.T, self.X2, self.X).sum(0).sum(1)
+        self.X.gradient[:] = grads

-class Kern_check_d2Kdiag_dXdX_cov(Kern_check_model):
+class Kern_check_d2Kdiag_dXdX(Kern_check_model):
    """This class allows gradient checks for the second derivative of a kernel with respect to X.
    """
-    def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
-        Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2)
+    def __init__(self, kernel=None, dL_dK=None, X=None):
+        Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X)
        self.X = Param('X',X)
        self.link_parameter(self.X)
+        self.Xc = X.copy()

    def log_likelihood(self):
-        return np.sum(self.kernel.gradients_X_diag(self.dL_dK.diagonal(),self.X))
+        return np.sum(self.kernel.gradients_X_diag(self.dL_dK.diagonal(), self.X))

    def parameters_changed(self):
-        grads = self.kernel.gradients_XX_diag(self.dL_dK.diagonal(), self.X, cov=True)
-        self.X.gradient[:] = grads.sum(-1)
+        grads = self.kernel.gradients_XX_diag(self.dL_dK.diagonal(), self.X)
+        self.X.gradient[:] = grads.sum(-1)

def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verbose=False, fixed_X_dims=None):
    """
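Both classes above piggyback on the checkgrad machinery: log_likelihood returns the (summed) first derivative obtained from gradients_X, and parameters_changed supplies gradients_XX as its analytic gradient, so checkgrad's numerical differencing effectively validates the second derivative. The same idea in isolation, checking gradients_X itself against finite differences of K (stock GPy API; kernel and sizes arbitrary):

    import numpy as np
    import GPy

    N, M, Q = 4, 3, 2
    X, X2 = np.random.randn(N, Q), np.random.randn(M, Q)
    W = np.random.randn(N, M)  # plays the role of dL_dK
    kern = GPy.kern.RBF(Q)

    analytic = kern.gradients_X(W, X, X2)  # (N, Q): d sum(W*K(X, X2)) / dX

    numeric, eps = np.zeros((N, Q)), 1e-6
    for n in range(N):
        for q in range(Q):
            Xp, Xm = X.copy(), X.copy()
            Xp[n, q] += eps
            Xm[n, q] -= eps
            # central difference of the weighted kernel sum
            numeric[n, q] = ((W * kern.K(Xp, X2)).sum()
                             - (W * kern.K(Xm, X2)).sum()) / (2 * eps)

    assert np.allclose(analytic, numeric, atol=1e-5)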
""" - def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): - Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2) + def __init__(self, kernel=None, dL_dK=None, X=None): + Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X) self.X = Param('X',X) self.link_parameter(self.X) + self.Xc = X.copy() def log_likelihood(self): - return np.sum(self.kernel.gradients_X_diag(self.dL_dK.diagonal(),self.X)) + return np.sum(self.kernel.gradients_X_diag(self.dL_dK.diagonal(), self.X)) def parameters_changed(self): - grads = self.kernel.gradients_XX_diag(self.dL_dK.diagonal(), self.X, cov=True) - self.X.gradient[:] = grads.sum(-1) + grads = self.kernel.gradients_XX_diag(self.dL_dK.diagonal(), self.X) + self.X.gradient[:] = grads.sum(-1) def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verbose=False, fixed_X_dims=None): """ @@ -273,29 +278,9 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb return False if verbose: - print("Checking gradients of dK(X, X) wrt X with full cov in dimensions") + print("Checking gradients of dK(X, X2) wrt X2 with full cov in dimensions") try: - testmodel = Kern_check_d2K_dXdX_cov(kern, X=X, X2=None) - if fixed_X_dims is not None: - testmodel.X[:,fixed_X_dims].fix() - result = testmodel.checkgrad(verbose=verbose) - except NotImplementedError: - result=True - if verbose: - print(("gradients_X not implemented for " + kern.name)) - if result and verbose: - print("Check passed.") - if not result: - print(("Gradient of dK(X, X) wrt X with full cov in dimensions failed for " + kern.name + " covariance function. Gradient values as follows:")) - testmodel.checkgrad(verbose=True) - assert(result) - pass_checks = False - return False - - if verbose: - print("Checking gradients of dK(X, X2) wrt X with full cov in dimensions") - try: - testmodel = Kern_check_d2K_dXdX_cov(kern, X=X, X2=X2) + testmodel = Kern_check_d2K_dXdX(kern, X=X, X2=X2) if fixed_X_dims is not None: testmodel.X[:,fixed_X_dims].fix() result = testmodel.checkgrad(verbose=verbose) @@ -312,10 +297,30 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb pass_checks = False return False + if verbose: + print("Checking gradients of dK(X, X) wrt X with full cov in dimensions") + try: + testmodel = Kern_check_d2K_dXdX(kern, X=X, X2=None) + if fixed_X_dims is not None: + testmodel.X[:,fixed_X_dims].fix() + result = testmodel.checkgrad(verbose=verbose) + except NotImplementedError: + result=True + if verbose: + print(("gradients_X not implemented for " + kern.name)) + if result and verbose: + print("Check passed.") + if not result: + print(("Gradient of dK(X, X) wrt X with full cov in dimensions failed for " + kern.name + " covariance function. Gradient values as follows:")) + testmodel.checkgrad(verbose=True) + assert(result) + pass_checks = False + return False + if verbose: print("Checking gradients of dKdiag(X, X) wrt X with cov in dimensions") try: - testmodel = Kern_check_d2Kdiag_dXdX_cov(kern, X=X, X2=None) + testmodel = Kern_check_d2Kdiag_dXdX(kern, X=X) if fixed_X_dims is not None: testmodel.X[:,fixed_X_dims].fix() result = testmodel.checkgrad(verbose=verbose)