diff --git a/GPy/kern/_src/independent_outputs.py b/GPy/kern/_src/independent_outputs.py index aa9dca80..6f8b7be1 100644 --- a/GPy/kern/_src/independent_outputs.py +++ b/GPy/kern/_src/independent_outputs.py @@ -8,7 +8,7 @@ import itertools def index_to_slices(index): """ - take a numpy array of integers (index) and return a nested list of slices such that the slices describe the start, stop points for each integer in the index. + take a numpy array of integers (index) and return a nested list of slices such that the slices describe the start, stop points for each integer in the index. e.g. >>> index = np.asarray([0,0,0,1,1,1,2,2,2]) @@ -79,10 +79,10 @@ class IndependentOutputs(CombinationKernel): def update_gradients_full(self,dL_dK,X,X2=None): slices = index_to_slices(X[:,self.index_dim]) - if self.single_kern: + if self.single_kern: target = np.zeros(self.kern.size) kerns = itertools.repeat(self.kern) - else: + else: kerns = self.kern target = [np.zeros(kern.size) for kern, _ in zip(kerns, slices)] def collate_grads(kern, i, dL, X, X2): @@ -94,7 +94,7 @@ class IndependentOutputs(CombinationKernel): else: slices2 = index_to_slices(X2[:,self.index_dim]) [[[collate_grads(kern, i, dL_dK[s,s2],X[s],X2[s2]) for s in slices_i] for s2 in slices_j] for i,(kern,slices_i,slices_j) in enumerate(zip(kerns,slices,slices2))] - if self.single_kern: + if self.single_kern: self.kern.gradient = target else: [kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(kerns, slices))] @@ -104,12 +104,14 @@ class IndependentOutputs(CombinationKernel): kerns = itertools.repeat(self.kern) if self.single_kern else self.kern if X2 is None: # TODO: make use of index_to_slices + # FIXME: Broken as X is already sliced out + print "Warning, gradients_X may not be working, I believe X has already been sliced out by the slicer!" values = np.unique(X[:,self.index_dim]) slices = [X[:,self.index_dim]==i for i in values] [target.__setitem__(s, kern.gradients_X(dL_dK[s,s],X[s],None)) for kern, s in zip(kerns, slices)] #slices = index_to_slices(X[:,self.index_dim]) - #[[np.add(target[s], kern.gradients_X(dL_dK[s,s], X[s]), out=target[s]) + #[[np.add(target[s], kern.gradients_X(dL_dK[s,s], X[s]), out=target[s]) # for s in slices_i] for kern, slices_i in zip(kerns, slices)] #import ipdb;ipdb.set_trace() #[[(np.add(target[s ], kern.gradients_X(dL_dK[s ,ss],X[s ], X[ss]), out=target[s ]), diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index 7cbd69eb..e0f6c0bc 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -24,7 +24,7 @@ class BayesianGPLVM(SparseGP_MPI): def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10, Z=None, kernel=None, inference_method=None, likelihood=None, name='bayesian gplvm', mpi_comm=None, normalizer=None, - missing_data=False, stochastic=False, batchsize=1): + missing_data=False, stochastic=False, batchsize=1, Y_metadata=None): self.logger = logging.getLogger(self.__class__.__name__) if X is None: @@ -69,6 +69,7 @@ class BayesianGPLVM(SparseGP_MPI): name=name, inference_method=inference_method, normalizer=normalizer, mpi_comm=mpi_comm, variational_prior=self.variational_prior, + Y_metadata=None ) self.link_parameter(self.X, index=0) @@ -83,7 +84,7 @@ class BayesianGPLVM(SparseGP_MPI): def parameters_changed(self): super(BayesianGPLVM,self).parameters_changed() if isinstance(self.inference_method, VarDTC_minibatch): - return + return kl_fctr = 1. self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X) diff --git a/GPy/models/gradient_checker.py b/GPy/models/gradient_checker.py index 74026f8e..c2cde834 100644 --- a/GPy/models/gradient_checker.py +++ b/GPy/models/gradient_checker.py @@ -5,6 +5,8 @@ from ..core.model import Model import itertools import numpy from ..core.parameterization import Param +np = numpy +from ..util.block_matrices import get_blocks, get_block_shapes, unblock, get_blocks_3d, get_block_shapes_3d def get_shape(x): if isinstance(x, numpy.ndarray): @@ -111,3 +113,261 @@ class GradientChecker(Model): #for name, shape in zip(self.names, self.shapes): #_param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape)))))) #return _param_names + + +class HessianChecker(GradientChecker): + + def __init__(self, f, df, ddf, x0, names=None, *args, **kwargs): + """ + :param f: Function (only used for numerical hessian gradient) + :param df: Gradient of function to check + :param ddf: Analytical gradient function + :param x0: + Initial guess for inputs x (if it has a shape (a,b) this will be reflected in the parameter names). + Can be a list of arrays, if takes a list of arrays. This list will be passed + to f and df in the same order as given here. + If only one argument, make sure not to pass a list!!! + + :type x0: [array-like] | array-like | float | int + :param names: + Names to print, when performing gradcheck. If a list was passed to x0 + a list of names with the same length is expected. + :param args: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs) + + """ + super(HessianChecker, self).__init__(df, ddf, x0, names=names, *args, **kwargs) + self._f = f + self._df = df + self._ddf = ddf + + def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3, block_indices=None, plot=False): + """ + Overwrite checkgrad method to check whole block instead of looping through + + Shows diagnostics using matshow instead + + :param verbose: If True, print a "full" checking of each parameter + :type verbose: bool + :param step: The size of the step around which to linearise the objective + :type step: float (default 1e-6) + :param tolerance: the tolerance allowed (see note) + :type tolerance: float (default 1e-3) + + Note:- + The gradient is considered correct if the ratio of the analytical + and numerical gradients is within of unity. + """ + try: + import numdifftools as nd + except: + raise ImportError("Don't have numdifftools package installed, it is not a GPy dependency as of yet, it is only used for hessian tests") + + if target_param: + raise NotImplementedError('Only basic functionality is provided with this gradchecker') + + #Repeat for each parameter, not the nicest but shouldn't be many cases where there are many + #variables + current_index = 0 + for name, shape in zip(self.names, self.shapes): + current_size = numpy.prod(shape) + x = self.optimizer_array.copy() + #x = self._get_params_transformed().copy() + x = x[current_index:current_index + current_size].reshape(shape) + + # Check gradients + analytic_hess = self._ddf(x) + if analytic_hess.shape[1] == 1: + analytic_hess = numpy.diagflat(analytic_hess) + + #From the docs: + #x0 : vector location + #at which to differentiate fun + #If x0 is an N x M array, then fun is assumed to be a function + #of N*M variables., thus we must have it flat, not (N,1), but just (N,) + #numeric_hess_partial = nd.Hessian(self._f, vectorized=False) + numeric_hess_partial = nd.Jacobian(self._df, vectorized=False) + #numeric_hess_partial = nd.Derivative(self._df, vectorized=True) + numeric_hess = numeric_hess_partial(x) + + check_passed = self.checkgrad_block(analytic_hess, numeric_hess, verbose=verbose, step=step, tolerance=tolerance, block_indices=block_indices, plot=plot) + current_index += current_size + return check_passed + + def checkgrad_block(self, analytic_hess, numeric_hess, verbose=False, step=1e-6, tolerance=1e-3, block_indices=None, plot=False): + """ + Checkgrad a block matrix + """ + if analytic_hess.dtype is np.dtype('object'): + #Make numeric hessian also into a block matrix + real_size = get_block_shapes(analytic_hess) + num_elements = np.sum(real_size) + if (num_elements, num_elements) == numeric_hess.shape: + #If the sizes are the same we assume they are the same + #(we have not fixed any values so the numeric is the whole hessian) + numeric_hess = get_blocks(numeric_hess, real_size) + else: + #Make a fake empty matrix and fill out the correct block + tmp_numeric_hess = get_blocks(np.zeros((num_elements, num_elements)), real_size) + tmp_numeric_hess[block_indices] = numeric_hess.copy() + numeric_hess = tmp_numeric_hess + + if block_indices is not None: + #Extract the right block + analytic_hess = analytic_hess[block_indices] + numeric_hess = numeric_hess[block_indices] + else: + #Unblock them if they are in blocks and you aren't checking a single block (checking whole hessian) + if analytic_hess.dtype is np.dtype('object'): + analytic_hess = unblock(analytic_hess) + numeric_hess = unblock(numeric_hess) + + ratio = numeric_hess / (numpy.where(analytic_hess==0, 1e-10, analytic_hess)) + difference = numpy.abs(analytic_hess - numeric_hess) + + check_passed = numpy.all((numpy.abs(1 - ratio)) < tolerance) or numpy.allclose(numeric_hess, analytic_hess, atol = tolerance) + + if verbose: + if block_indices: + print "\nBlock {}".format(block_indices) + else: + print "\nAll blocks" + + header = ['Checked', 'Max-Ratio', 'Min-Ratio', 'Min-Difference', 'Max-Difference'] + header_string = map(lambda x: ' | '.join(header), [header]) + separator = '-' * len(header_string[0]) + print '\n'.join([header_string[0], separator]) + min_r = '%.6f' % float(numpy.min(ratio)) + max_r = '%.6f' % float(numpy.max(ratio)) + max_d = '%.6f' % float(numpy.max(difference)) + min_d = '%.6f' % float(numpy.min(difference)) + cols = [max_r, min_r, min_d, max_d] + + if check_passed: + checked = "\033[92m True \033[0m" + else: + checked = "\033[91m False \033[0m" + + grad_string = "{} | {} | {} | {} | {} ".format(checked, cols[0], cols[1], cols[2], cols[3]) + print grad_string + + if plot: + import pylab as pb + fig, axes = pb.subplots(2, 2) + max_lim = numpy.max(numpy.vstack((analytic_hess, numeric_hess))) + min_lim = numpy.min(numpy.vstack((analytic_hess, numeric_hess))) + msa = axes[0,0].matshow(analytic_hess, vmin=min_lim, vmax=max_lim) + axes[0,0].set_title('Analytic hessian') + axes[0,0].xaxis.set_ticklabels([None]) + axes[0,0].yaxis.set_ticklabels([None]) + axes[0,0].xaxis.set_ticks([None]) + axes[0,0].yaxis.set_ticks([None]) + msn = axes[0,1].matshow(numeric_hess, vmin=min_lim, vmax=max_lim) + pb.colorbar(msn, ax=axes[0,1]) + axes[0,1].set_title('Numeric hessian') + axes[0,1].xaxis.set_ticklabels([None]) + axes[0,1].yaxis.set_ticklabels([None]) + axes[0,1].xaxis.set_ticks([None]) + axes[0,1].yaxis.set_ticks([None]) + msr = axes[1,0].matshow(ratio) + pb.colorbar(msr, ax=axes[1,0]) + axes[1,0].set_title('Ratio') + axes[1,0].xaxis.set_ticklabels([None]) + axes[1,0].yaxis.set_ticklabels([None]) + axes[1,0].xaxis.set_ticks([None]) + axes[1,0].yaxis.set_ticks([None]) + msd = axes[1,1].matshow(difference) + pb.colorbar(msd, ax=axes[1,1]) + axes[1,1].set_title('difference') + axes[1,1].xaxis.set_ticklabels([None]) + axes[1,1].yaxis.set_ticklabels([None]) + axes[1,1].xaxis.set_ticks([None]) + axes[1,1].yaxis.set_ticks([None]) + if block_indices: + fig.suptitle("Block: {}".format(block_indices)) + pb.show() + + return check_passed + +class SkewChecker(HessianChecker): + + def __init__(self, df, ddf, dddf, x0, names=None, *args, **kwargs): + """ + :param df: gradient of function + :param ddf: Gradient of function to check (hessian) + :param dddf: Analytical gradient function (third derivative) + :param x0: + Initial guess for inputs x (if it has a shape (a,b) this will be reflected in the parameter names). + Can be a list of arrays, if takes a list of arrays. This list will be passed + to f and df in the same order as given here. + If only one argument, make sure not to pass a list!!! + + :type x0: [array-like] | array-like | float | int + :param names: + Names to print, when performing gradcheck. If a list was passed to x0 + a list of names with the same length is expected. + :param args: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs) + + """ + super(SkewChecker, self).__init__(df, ddf, dddf, x0, names=names, *args, **kwargs) + + def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3, block_indices=None, plot=False, super_plot=False): + """ + Gradient checker that just checks each hessian individually + + super_plot will plot the hessian wrt every parameter, plot will just do the first one + """ + try: + import numdifftools as nd + except: + raise ImportError("Don't have numdifftools package installed, it is not a GPy dependency as of yet, it is only used for hessian tests") + + if target_param: + raise NotImplementedError('Only basic functionality is provided with this gradchecker') + + #Repeat for each parameter, not the nicest but shouldn't be many cases where there are many + #variables + current_index = 0 + for name, n_shape in zip(self.names, self.shapes): + current_size = numpy.prod(n_shape) + x = self.optimizer_array.copy() + #x = self._get_params_transformed().copy() + x = x[current_index:current_index + current_size].reshape(n_shape) + + # Check gradients + #Actually the third derivative + analytic_hess = self._ddf(x) + + #Can only calculate jacobian for one variable at a time + #From the docs: + #x0 : vector location + #at which to differentiate fun + #If x0 is an N x M array, then fun is assumed to be a function + #of N*M variables., thus we must have it flat, not (N,1), but just (N,) + #numeric_hess_partial = nd.Hessian(self._f, vectorized=False) + #Actually _df is already the hessian + numeric_hess_partial = nd.Jacobian(self._df, vectorized=True) + numeric_hess = numeric_hess_partial(x) + + print "Done making numerical hessian" + if analytic_hess.dtype is np.dtype('object'): + #Blockify numeric_hess aswell + blocksizes, pagesizes = get_block_shapes_3d(analytic_hess) + #HACK + real_block_size = np.sum(blocksizes) + numeric_hess = numeric_hess.reshape(real_block_size, real_block_size, pagesizes) + #numeric_hess = get_blocks_3d(numeric_hess, blocksizes)#, pagesizes) + else: + numeric_hess = numeric_hess.reshape(*analytic_hess.shape) + + #Check every block individually (for ease) + check_passed = [False]*numeric_hess.shape[2] + for block_ind in xrange(numeric_hess.shape[2]): + #Unless super_plot is set, just plot the first one + p = True if (plot and block_ind == numeric_hess.shape[2]-1) or super_plot else False + if verbose: + print "Checking derivative of hessian wrt parameter number {}".format(block_ind) + check_passed[block_ind] = self.checkgrad_block(analytic_hess[:,:,block_ind], numeric_hess[:,:,block_ind], verbose=verbose, step=step, tolerance=tolerance, block_indices=block_indices, plot=p) + + current_index += current_size + return np.all(check_passed) + diff --git a/GPy/util/block_matrices.py b/GPy/util/block_matrices.py index a047abc6..e1e04aaa 100644 --- a/GPy/util/block_matrices.py +++ b/GPy/util/block_matrices.py @@ -1,9 +1,37 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Copyright (c) 2014-2015, Alan Saul # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np +def get_blocks_3d(A, blocksizes, pagesizes=None): + """ + Given a 3d matrix, make a block matrix, where the first and second dimensions are blocked according + to blocksizes, and the pages are blocked using pagesizes + """ + assert (A.shape[0]==A.shape[1]) and len(A.shape)==3, "can't blockify this non-square matrix, may need to use 2d version" + N = np.sum(blocksizes) + assert A.shape[0] == N, "bad blocksizes" + num_blocks = len(blocksizes) + if pagesizes == None: + #Assume each page of A should be its own dimension + pagesizes = range(A.shape[2])#[0]*A.shape[2] + num_pages = len(pagesizes) + B = np.empty(shape=(num_blocks, num_blocks, num_pages), dtype=np.object) + count_k = 0 + #for Bk, k in enumerate(pagesizes): + for Bk in pagesizes: + count_i = 0 + for Bi, i in enumerate(blocksizes): + count_j = 0 + for Bj, j in enumerate(blocksizes): + #We want to have it count_k:count_k + k but its annoying as it makes a NxNx1 array is page sizes are set to 1 + B[Bi, Bj, Bk] = A[count_i:count_i + i, count_j:count_j + j, Bk] + count_j += j + count_i += i + #count_k += k + return B + def get_blocks(A, blocksizes): - assert (A.shape[0]==A.shape[1]) and len(A.shape)==2, "can;t blockify this non-square matrix" + assert (A.shape[0]==A.shape[1]) and len(A.shape)==2, "can't blockify this non-square matrix" N = np.sum(blocksizes) assert A.shape[0] == N, "bad blocksizes" num_blocks = len(blocksizes) @@ -17,6 +45,11 @@ def get_blocks(A, blocksizes): count_i += i return B +def get_block_shapes_3d(B): + assert B.dtype is np.dtype('object'), "Must be a block matrix" + #FIXME: This isn't general AT ALL... + return get_block_shapes(B[:,:,0]), B.shape[2] + def get_block_shapes(B): assert B.dtype is np.dtype('object'), "Must be a block matrix" return [B[b,b].shape[0] for b in range(0, B.shape[0])] @@ -35,7 +68,7 @@ def unblock(B): count_i += i return A -def block_dot(A, B): +def block_dot(A, B, diagonal=False): """ Element wise dot product on block matricies @@ -48,21 +81,30 @@ def block_dot(A, B): +-------------+ +------+------+ +-------+-------+ ..Note + If any block of either (A or B) are stored as 1d vectors then we assume + that it denotes a diagonal matrix efficient dot product using numpy + broadcasting will be used, i.e. A11*B11 + If either (A or B) of the diagonal matrices are stored as vectors then a more efficient dot product using numpy broadcasting will be used, i.e. A11*B11 """ #Must have same number of blocks and be a block matrix assert A.dtype is np.dtype('object'), "Must be a block matrix" assert B.dtype is np.dtype('object'), "Must be a block matrix" - Ashape = A.shape - Bshape = B.shape - assert Ashape == Bshape - def f(A,B): - if Ashape[0] == Ashape[1] or Bshape[0] == Bshape[1]: - #FIXME: Careful if one is transpose of other, would make a matrix - return A*B + assert A.shape == B.shape + def f(C,D): + """ + C is an element of A, D is the associated element of B + """ + Cshape = C.shape + Dshape = D.shape + if diagonal and (len(Cshape) == 1 or len(Dshape) == 1\ + or C.shape[0] != C.shape[1] or D.shape[0] != D.shape[1]): + print "Broadcasting, C: {} D:{}".format(C.shape, D.shape) + return C*D else: - return np.dot(A,B) + print "Dotting, C: {} C:{}".format(C.shape, D.shape) + return np.dot(C,D) dot = np.vectorize(f, otypes = [np.object]) return dot(A,B)