diff --git a/GPy/core/model.py b/GPy/core/model.py index 921d50b1..57d41602 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -260,17 +260,18 @@ class Model(Parameterized): """ positive_strings = ['variance', 'lengthscale', 'precision', 'kappa', 'sensitivity'] # param_names = self._get_param_names() - for s in positive_strings: - paramlist = self.grep_param_names(".*"+s) - for param in paramlist: - for p in param.flattened_parameters: - rav_i = set(self._raveled_index_for(p)) - for constraint in self.constraints.iter_properties(): - rav_i -= set(self._constraint_indices(p, constraint)) - rav_i -= set(np.nonzero(self._fixes_for(p)!=UNFIXED)[0]) - ind = self._backtranslate_index(p, np.array(list(rav_i), dtype=int)) - if ind.size != 0: - p[np.unravel_index(ind, p.shape)].constrain_positive() + +# for s in positive_strings: +# paramlist = self.grep_param_names(".*"+s) +# for param in paramlist: +# for p in param.flattened_parameters: +# rav_i = set(self._raveled_index_for(p)) +# for constraint in self.constraints.iter_properties(): +# rav_i -= set(self._constraint_indices(p, constraint)) +# rav_i -= set(np.nonzero(self._fixes_for(p)!=UNFIXED)[0]) +# ind = self._backtranslate_index(p, np.array(list(rav_i), dtype=int)) +# if ind.size != 0: +# p[np.unravel_index(ind, p.shape)].constrain_positive(warning=warning) # if paramlist: # self.__getitem__(None, paramlist).constrain_positive(warning=warning) # currently_constrained = self.all_constrained_indices() @@ -405,18 +406,18 @@ class Model(Parameterized): dx = step * np.sign(np.random.uniform(-1, 1, x.size)) # evaulate around the point x - f1, g1 = self.objective_and_gradients(x + dx) - f2, g2 = self.objective_and_gradients(x - dx) + f1 = self.objective_function(x + dx) + f2 = self.objective_function(x - dx) gradient = self.objective_function_gradients(x) numerical_gradient = (f1 - f2) / (2 * dx) global_ratio = (f1 - f2) / (2 * np.dot(dx, np.where(gradient==0, 1e-32, gradient))) - + return (np.abs(1. 
- global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() < tolerance) else: # check the gradient of each parameter individually, and do some pretty printing try: - names = self._get_param_names_transformed() + names = self._get_param_names() except NotImplementedError: names = ['Variable %i' % i for i in range(len(x))] # Prepare for pretty-printing @@ -430,42 +431,46 @@ class Model(Parameterized): header_string = map(lambda x: '|'.join(x), [header_string]) separator = '-' * len(header_string[0]) print '\n'.join([header_string[0], separator]) - if target_param is None: param_list = range(len(x)) else: param_list = self._raveled_index_for(target_param) if self._has_fixes(): - param_list = param_list[self._fixes_] - if not np.any(param_list): + param_list = np.intersect1d(param_list, np.r_[:self.size][self._fixes_], True) + + if param_list.size == 0: print "No free parameters to check" return gradient = self.objective_function_gradients(x) np.where(gradient==0, 1e-312, gradient) - - for i in param_list: + ret = True + for i, ind in enumerate(param_list): xx = x.copy() - xx[i] += step - f1, g1 = self.objective_and_gradients(xx) - xx[i] -= 2.*step - f2, g2 = self.objective_and_gradients(xx) + xx[ind] += step + f1 = self.objective_function(xx) + xx[ind] -= 2.*step + f2 = self.objective_function(xx) numerical_gradient = (f1 - f2) / (2 * step) - ratio = (f1 - f2) / (2 * step * gradient[i]) - difference = np.abs((f1 - f2) / 2 / step - gradient[i]) + ratio = (f1 - f2) / (2 * step * gradient[ind]) + difference = np.abs((f1 - f2) / 2 / step - gradient[ind]) if (np.abs(1. - ratio) < tolerance) or np.abs(difference) < tolerance: - formatted_name = "\033[92m {0} \033[0m".format(names[i]) + formatted_name = "\033[92m {0} \033[0m".format(names[ind]) + ret &= True else: - formatted_name = "\033[91m {0} \033[0m".format(names[i]) + formatted_name = "\033[91m {0} \033[0m".format(names[ind]) + ret &= False + r = '%.6f' % float(ratio) d = '%.6f' % float(difference) - g = '%.6f' % gradient[i] + g = '%.6f' % gradient[ind] ng = '%.6f' % float(numerical_gradient) grad_string = "{0:<{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}".format(formatted_name, r, d, g, ng, c0=cols[0] + 9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4]) print grad_string - + return ret + def input_sensitivity(self): """ return an array describing the sesitivity of the model to each input diff --git a/GPy/core/parameterization/array_core.py b/GPy/core/parameterization/array_core.py index c95f3ce3..4b5b7700 100644 --- a/GPy/core/parameterization/array_core.py +++ b/GPy/core/parameterization/array_core.py @@ -27,6 +27,24 @@ class ParamList(list): return False pass +class C(np.ndarray): + __array_priority__ = 1. + def __new__(cls, array): + obj = array.view(cls) + return obj + #def __array_finalize__(self, obj): + # #print 'finalize' + # return obj + def __array_prepare__(self, out_arr, context): + #print 'prepare' + while type(out_arr) is C: + out_arr = out_arr.base + return out_arr + def __array_wrap__(self, out_arr, context): + #print 'wrap', type(self), type(out_arr), context + while type(out_arr) is C: + out_arr = out_arr.base + return out_arr class ObservableArray(ListArray, Observable): """ @@ -35,7 +53,7 @@ class ObservableArray(ListArray, Observable): will be called every time this array changes. The callable takes exactly one argument, which is this array itself. 
""" - __array_priority__ = 0 # Never give back Param + __array_priority__ = -1 # Never give back ObservableArray def __new__(cls, input_array): cls.__name__ = "ObservableArray\n " obj = super(ObservableArray, cls).__new__(cls, input_array).view(cls) diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 772cbd42..9e4195e9 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -24,8 +24,11 @@ class Param(ObservableArray, Constrainable, Gradcheckable): """ Parameter object for GPy models. - :param name: name of the parameter to be printed - :param input_array: array which this parameter handles + :param str name: name of the parameter to be printed + :param input_array: array which this parameter handles + :type input_array: numpy.ndarray + :param default_constraint: The default constraint for this parameter + :type default_constraint: You can add/remove constraints by calling constrain on the parameter itself, e.g: @@ -40,19 +43,10 @@ class Param(ObservableArray, Constrainable, Gradcheckable): See :py:class:`GPy.core.parameterized.Parameterized` for more details on constraining etc. - This ndarray can be stored in lists and checked if it is in. - - >>> import numpy as np - >>> x = np.random.normal(size=(10,3)) - >>> x in [[1], x, [3]] - True - - WARNING: This overrides the functionality of x==y!!! - Use numpy.equal(x,y) for element-wise equality testing. """ - __array_priority__ = 0 # Never give back Param + __array_priority__ = -1 # Never give back Param _fixes_ = None - def __new__(cls, name, input_array, *args, **kwargs): + def __new__(cls, name, input_array, default_constraint=None): obj = numpy.atleast_1d(super(Param, cls).__new__(cls, input_array=input_array)) obj._current_slice_ = (slice(obj.shape[0]),) obj._realshape_ = obj.shape @@ -66,8 +60,8 @@ class Param(ObservableArray, Constrainable, Gradcheckable): obj.gradient = None return obj - def __init__(self, name, input_array): - super(Param, self).__init__(name=name) + def __init__(self, name, input_array, default_constraint=None): + super(Param, self).__init__(name=name, default_constraint=default_constraint) def __array_finalize__(self, obj): # see InfoArray.__array_finalize__ for comments @@ -75,6 +69,7 @@ class Param(ObservableArray, Constrainable, Gradcheckable): super(Param, self).__array_finalize__(obj) self._direct_parent_ = getattr(obj, '_direct_parent_', None) self._parent_index_ = getattr(obj, '_parent_index_', None) + self._default_constraint_ = getattr(obj, '_default_constraint_', None) self._current_slice_ = getattr(obj, '_current_slice_', None) self._tied_to_me_ = getattr(obj, '_tied_to_me_', None) self._tied_to_ = getattr(obj, '_tied_to_', None) @@ -97,6 +92,7 @@ class Param(ObservableArray, Constrainable, Gradcheckable): (self.name, self._direct_parent_, self._parent_index_, + self._default_constraint_, self._current_slice_, self._realshape_, self._realsize_, @@ -117,6 +113,7 @@ class Param(ObservableArray, Constrainable, Gradcheckable): self._realsize_ = state.pop() self._realshape_ = state.pop() self._current_slice_ = state.pop() + self._default_constraint_ = state.pop() self._parent_index_ = state.pop() self._direct_parent_ = state.pop() self.name = state.pop() diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 7505c796..1075d808 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -91,8 +91,9 @@ class Gradcheckable(Parentable): 
class Constrainable(Nameable): - def __init__(self, name): + def __init__(self, name, default_constraint=None): super(Constrainable,self).__init__(name) + self._default_constraint_ = default_constraint #=========================================================================== # Fixing Parameters: #=========================================================================== diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py index 746163dc..678b119b 100644 --- a/GPy/core/parameterization/parameterized.py +++ b/GPy/core/parameterization/parameterized.py @@ -171,6 +171,8 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable): for t, ind in param._constraints_.iteritems(): self.constraints.add(t, ind+self._offset_for(param)) param._constraints_.clear() + if param._default_constraint_ is not None: + self._add_constrain(param, param._default_constraint_, False) if self._has_fixes() and np.all(self._fixes_): # ==UNFIXED self._fixes_= None @@ -312,8 +314,11 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable): #=========================================================================== # Optimization handles: #=========================================================================== - def _get_param_names_transformed(self): + def _get_param_names(self): n = numpy.array([p.name_hirarchical+'['+str(i)+']' for p in self.flattened_parameters for i in p._indices()]) + return n + def _get_param_names_transformed(self): + n = self._get_param_names() if self._has_fixes(): return n[self._fixes_] return n diff --git a/GPy/core/svigp.py b/GPy/core/svigp.py index 9e4f3b12..a2c7acee 100644 --- a/GPy/core/svigp.py +++ b/GPy/core/svigp.py @@ -154,13 +154,13 @@ class SVIGP(GP): self.psi2 = None def dL_dtheta(self): - dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z) + dL_dtheta = self.kern._param_grad_helper(self.dL_dKmm, self.Z) if self.has_uncertain_inputs: dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X_batch, self.X_variance_batch) dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X_batch, self.X_variance_batch) dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X_batch, self.X_variance_batch) else: - dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.X_batch, self.Z) + dL_dtheta += self.kern._param_grad_helper(self.dL_dpsi1, self.X_batch, self.Z) dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X_batch) return dL_dtheta diff --git a/GPy/inference/latent_function_inference/fitc.py b/GPy/inference/latent_function_inference/fitc.py index e4c01252..82d0973f 100644 --- a/GPy/inference/latent_function_inference/fitc.py +++ b/GPy/inference/latent_function_inference/fitc.py @@ -1,73 +1,54 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -import numpy as np -from ...util.linalg import mdot, jitchol, chol_inv, tdot, dtrtrs -from ...core import SparseGP +class VarDTC(object): + def __init__(self): + self.const_jitter = 1e-6 -class FITC(SparseGP): - """ + def inference(self, kern, X, X_variance, Z, likelihood, Y): - Sparse FITC approximation + num_inducing, _ = Z.shape + num_data, output_dim = Y.shape - :param X: inputs - :type X: np.ndarray (num_data x Q) - :param likelihood: a likelihood instance, containing the observed data - :type likelihood: GPy.likelihood.(Gaussian | EP) - :param kernel: the kernel (covariance function). 
See link kernels - :type kernel: a GPy.kern.kern instance - :param Z: inducing inputs (optional, see note) - :type Z: np.ndarray (M x Q) | None - :param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales) - :type normalize_(X|Y): bool + #make sure we're not using variational uncertain inputs + assert X_variance is None, "variational inducing inputs only for use with varDTC inference" + #Note: we can't do the variational thing after making the FITC conditional approximation because K~ appears in the log determinant. - """ + #see whether we've got a different noise variance for each datum + sigma2 = np.squeeze(likelihood.variance) + het_noise = False + if sigma2.size > 1: + het_noise = True - def __init__(self, X, likelihood, kernel, Z, normalize_X=False): - SparseGP.__init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False) - assert self.output_dim == 1, "FITC model is not defined for handling multiple outputs" - - def update_likelihood_approximation(self, **kwargs): - """ - Approximates a non-Gaussian likelihood using Expectation Propagation - - For a Gaussian likelihood, no iteration is required: - this function does nothing - """ - self.likelihood.restart() - self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0, **kwargs) - self._set_params(self._get_params()) - - def _compute_kernel_matrices(self): # kernel computations, using BGPLVM notation - self.Kmm = self.kern.K(self.Z) - self.psi0 = self.kern.Kdiag(self.X) - self.psi1 = self.kern.K(self.Z, self.X) - self.psi2 = None + Kmm = kern.K(Z) + Kdiag = kern.Kdiag(X) + Kmn = kern.K(X, Z) - def _computations(self): #factor Kmm - self.Lm = jitchol(self.Kmm) - self.Lmi,info = dtrtrs(self.Lm,np.eye(self.num_inducing),lower=1) - Lmipsi1 = np.dot(self.Lmi,self.psi1) - self.Qnn = np.dot(Lmipsi1.T,Lmipsi1).copy() - self.Diag0 = self.psi0 - np.diag(self.Qnn) - self.beta_star = self.likelihood.precision/(1. 
+ self.likelihood.precision*self.Diag0[:,None]) #NOTE: beta_star contains Diag0 and the precision - self.V_star = self.beta_star * self.likelihood.Y + Lm = jitchol(Kmm) + V = dtrtrs(Lm, Kmn, lower=1) - # The rather complex computations of self.A - tmp = self.psi1 * (np.sqrt(self.beta_star.flatten().reshape(1, self.num_data))) - tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1) - self.A = tdot(tmp) + #compute effective noise + g_sn2 = sigma2 + Kdiag - np.sum(V*V, 0) - # factor B - self.B = np.eye(self.num_inducing) + self.A - self.LB = jitchol(self.B) - self.LBi = chol_inv(self.LB) - self.psi1V = np.dot(self.psi1, self.V_star) + #compute and factor B + tmp = Kmn / np.sqrt(g_sn2) + tmp, _ = dtrtrs(Lm, tmp, lower=1) + A = tdot(tmp) + B = np.eye(num_inducing) + A + Bi, Lb, LBi, log_det_B = pdinv(B) - Lmi_psi1V, info = dtrtrs(self.Lm, np.asfortranarray(self.psi1V), lower=1, trans=0) - self._LBi_Lmi_psi1V, _ = dtrtrs(self.LB, np.asfortranarray(Lmi_psi1V), lower=1, trans=0) + #compute posterior parameters + tmp, _ = dtrtrs() + woodbury_vec = + + + + + psi1V = np.dot(self.psi1, self.V_star) + Lmi_psi1V, _ = dtrtrs(Lm, self.psi1V, lower=1, trans=0) + LBi_Lmi_psi1V, _ = dtrtrs(LB, Lmi_psi1V, lower=1, trans=0) Kmmipsi1 = np.dot(self.Lmi.T,Lmipsi1) b_psi1_Ki = self.beta_star * Kmmipsi1.T @@ -116,8 +97,8 @@ class FITC(SparseGP): K_pp_K = np.dot(Kmmipsi1[:,i:(i+1)],Kmmipsi1[:,i:(i+1)].T) _dpsi1 = (-V_n**2 - alpha_n + 2.*gamma_k - gamma_n**2) * Kmmipsi1.T[i:(i+1),:] _dKmm = .5*(V_n**2 + alpha_n + gamma_n**2 - 2.*gamma_k) * K_pp_K #Diag_dD_dKmm - self._dpsi1_dtheta += self.kern.dK_dtheta(_dpsi1,self.X[i:i+1,:],self.Z) - self._dKmm_dtheta += self.kern.dK_dtheta(_dKmm,self.Z) + self._dpsi1_dtheta += self.kern._param_grad_helper(_dpsi1,self.X[i:i+1,:],self.Z) + self._dKmm_dtheta += self.kern._param_grad_helper(_dKmm,self.Z) self._dKmm_dX += self.kern.gradients_X(_dKmm ,self.Z) self._dpsi1_dX += self.kern.gradients_X(_dpsi1.T,self.Z,self.X[i:i+1,:]) @@ -163,8 +144,8 @@ class FITC(SparseGP): def dL_dtheta(self): dL_dtheta = self.kern.dKdiag_dtheta(self._dL_dpsi0,self.X) - dL_dtheta += self.kern.dK_dtheta(self._dL_dpsi1,self.X,self.Z) - dL_dtheta += self.kern.dK_dtheta(self._dL_dKmm,X=self.Z) + dL_dtheta += self.kern._param_grad_helper(self._dL_dpsi1,self.X,self.Z) + dL_dtheta += self.kern._param_grad_helper(self._dL_dKmm,X=self.Z) dL_dtheta += self._dKmm_dtheta dL_dtheta += self._dpsi1_dtheta return dL_dtheta diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py index c97807fb..98945c2d 100644 --- a/GPy/kern/kern.py +++ b/GPy/kern/kern.py @@ -8,6 +8,7 @@ from parts.prod import Prod as prod from parts.linear import Linear from parts.kernpart import Kernpart from ..core.parameterization import Parameterized +from GPy.core.parameterization.param import Param class kern(Parameterized): def __init__(self, input_dim, parts=[], input_slices=None): @@ -84,7 +85,7 @@ class kern(Parameterized): # represents the gradient in the transformed space (i.e. 
that given by # get_params_transformed()) # -# :param g: the gradient vector for the current model, usually created by dK_dtheta +# :param g: the gradient vector for the current model, usually created by _param_grad_helper # """ # x = self._get_params() # [np.place(g, index, g[index] * constraint.gradfactor(x[index])) @@ -294,7 +295,7 @@ class kern(Parameterized): def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): [p.update_gradients_variational(dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z) for p in self._parameters_] - def dK_dtheta(self, dL_dK, X, X2=None): + def _param_grad_helper(self, dL_dK, X, X2=None): """ Compute the gradient of the covariance function with respect to the parameters. @@ -310,9 +311,9 @@ class kern(Parameterized): assert X.shape[1] == self.input_dim target = np.zeros(self.size) if X2 is None: - [p.dK_dtheta(dL_dK, X[:, i_s], None, target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)] + [p._param_grad_helper(dL_dK, X[:, i_s], None, target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)] else: - [p.dK_dtheta(dL_dK, X[:, i_s], X2[:, i_s], target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)] + [p._param_grad_helper(dL_dK, X[:, i_s], X2[:, i_s], target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)] return self._transform_gradients(target) @@ -484,6 +485,7 @@ from GPy.core.model import Model class Kern_check_model(Model): """This is a dummy model class used as a base class for checking that the gradients of a given kernel are implemented correctly. It enables checkgradient() to be called independently on a kernel.""" def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): + Model.__init__(self, 'kernel_test_model') num_samples = 20 num_samples2 = 10 if kernel==None: @@ -495,14 +497,12 @@ class Kern_check_model(Model): dL_dK = np.ones((X.shape[0], X.shape[0])) else: dL_dK = np.ones((X.shape[0], X2.shape[0])) - + self.kernel=kernel + self.add_parameter(kernel) self.X = X self.X2 = X2 self.dL_dK = dL_dK - #self.constrained_indices=[] - #self.constraints=[] - Model.__init__(self, 'kernel_test_model') def is_positive_definite(self): v = np.linalg.eig(self.kernel.K(self.X))[0] @@ -511,15 +511,6 @@ class Kern_check_model(Model): else: return True - def _get_params(self): - return self.kernel._get_params() - - def _get_param_names(self): - return self.kernel._get_param_names() - - def _set_params(self, x): - self.kernel._set_params(x) - def log_likelihood(self): return (self.dL_dK*self.kernel.K(self.X, self.X2)).sum() @@ -532,7 +523,7 @@ class Kern_check_dK_dtheta(Kern_check_model): Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2) def _log_likelihood_gradients(self): - return self.kernel.dK_dtheta(self.dL_dK, self.X, self.X2) + return self.kernel._param_grad_helper(self.dL_dK, self.X, self.X2) class Kern_check_dKdiag_dtheta(Kern_check_model): """This class allows gradient checks of the gradient of the diagonal of a kernel with respect to the parameters.""" @@ -540,6 +531,8 @@ class Kern_check_dKdiag_dtheta(Kern_check_model): Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None) if dL_dK==None: self.dL_dK = np.ones((self.X.shape[0])) + def parameters_changed(self): + self.kernel.update_gradients_full(self.dL_dK, self.X) def log_likelihood(self): return (self.dL_dK*self.kernel.Kdiag(self.X)).sum() @@ -551,41 +544,25 @@ class 
Kern_check_dK_dX(Kern_check_model): """This class allows gradient checks for the gradient of a kernel with respect to X. """ def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2) - + self.remove_parameter(kernel) + self.X = Param('X', self.X) + self.add_parameter(self.X) def _log_likelihood_gradients(self): return self.kernel.gradients_X(self.dL_dK, self.X, self.X2).flatten() - def _get_param_names(self): - return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])] - - def _get_params(self): - return self.X.flatten() - - def _set_params(self, x): - self.X=x.reshape(self.X.shape) - -class Kern_check_dKdiag_dX(Kern_check_model): +class Kern_check_dKdiag_dX(Kern_check_dK_dX): """This class allows gradient checks for the gradient of a kernel diagonal with respect to X. """ def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): - Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None) + Kern_check_dK_dX.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None) if dL_dK==None: self.dL_dK = np.ones((self.X.shape[0])) - + def log_likelihood(self): return (self.dL_dK*self.kernel.Kdiag(self.X)).sum() def _log_likelihood_gradients(self): return self.kernel.dKdiag_dX(self.dL_dK, self.X).flatten() - def _get_param_names(self): - return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])] - - def _get_params(self): - return self.X.flatten() - - def _set_params(self, x): - self.X=x.reshape(self.X.shape) - def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False): """ This function runs on kernels to check the correctness of their diff --git a/GPy/kern/parts/Brownian.py b/GPy/kern/parts/Brownian.py index 17f65cbd..488e9b7a 100644 --- a/GPy/kern/parts/Brownian.py +++ b/GPy/kern/parts/Brownian.py @@ -43,7 +43,7 @@ class Brownian(Kernpart): def Kdiag(self,X,target): target += self.variance*X.flatten() - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): if X2 is None: X2 = X target += np.sum(np.fmin(X,X2.T)*dL_dK) diff --git a/GPy/kern/parts/Matern32.py b/GPy/kern/parts/Matern32.py index a95f0bcf..08fa452c 100644 --- a/GPy/kern/parts/Matern32.py +++ b/GPy/kern/parts/Matern32.py @@ -74,7 +74,7 @@ class Matern32(Kernpart): """Compute the diagonal of the covariance matrix associated to X.""" np.add(target, self.variance, target) - def dK_dtheta(self, dL_dK, X, X2, target): + def _param_grad_helper(self, dL_dK, X, X2, target): """derivative of the covariance matrix with respect to the parameters.""" if X2 is None: X2 = X dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1)) diff --git a/GPy/kern/parts/Matern52.py b/GPy/kern/parts/Matern52.py index 1f87fefb..7d36254c 100644 --- a/GPy/kern/parts/Matern52.py +++ b/GPy/kern/parts/Matern52.py @@ -74,7 +74,7 @@ class Matern52(Kernpart): """Compute the diagonal of the covariance matrix associated to X.""" np.add(target,self.variance,target) - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): """derivative of the covariance matrix with respect to the parameters.""" if X2 is None: X2 = X dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1)) diff --git a/GPy/kern/parts/ODE_1.py b/GPy/kern/parts/ODE_1.py index 416278e3..15faf108 100644 --- a/GPy/kern/parts/ODE_1.py +++ b/GPy/kern/parts/ODE_1.py @@ -90,7 +90,7 @@ class ODE_1(Kernpart): 
np.add(self.varianceU*self.varianceY*(k1+k2+k3), target, target) - def dK_dtheta(self, dL_dK, X, X2, target): + def _param_grad_helper(self, dL_dK, X, X2, target): """derivative of the covariance matrix with respect to the parameters.""" if X2 is None: X2 = X dist = np.abs(X - X2.T) diff --git a/GPy/kern/parts/eq_ode1.py b/GPy/kern/parts/eq_ode1.py index 85bb6379..bf0ca7e4 100644 --- a/GPy/kern/parts/eq_ode1.py +++ b/GPy/kern/parts/eq_ode1.py @@ -124,7 +124,7 @@ class Eq_ode1(Kernpart): #target += np.diag(self.B)[np.asarray(index,dtype=np.int).flatten()] pass - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): # First extract times and indices. self._extract_t_indices(X, X2, dL_dK=dL_dK) diff --git a/GPy/kern/parts/exponential.py b/GPy/kern/parts/exponential.py index 7cd92aff..372d4d9b 100644 --- a/GPy/kern/parts/exponential.py +++ b/GPy/kern/parts/exponential.py @@ -75,7 +75,7 @@ class Exponential(Kernpart): """Compute the diagonal of the covariance matrix associated to X.""" np.add(target, self.variance, target) - def dK_dtheta(self, dL_dK, X, X2, target): + def _param_grad_helper(self, dL_dK, X, X2, target): """derivative of the covariance matrix with respect to the parameters.""" if X2 is None: X2 = X dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1)) diff --git a/GPy/kern/parts/finite_dimensional.py b/GPy/kern/parts/finite_dimensional.py index 6cc2325f..9e54fb34 100644 --- a/GPy/kern/parts/finite_dimensional.py +++ b/GPy/kern/parts/finite_dimensional.py @@ -50,7 +50,7 @@ class FiniteDimensional(Kernpart): def Kdiag(self,X,target): product = np.diag(self.K(X, X)) np.add(target,product,target) - def dK_dtheta(self,X,X2,target): + def _param_grad_helper(self,X,X2,target): """Return shape is NxMx(Ntheta)""" if X2 is None: X2 = X FX = np.column_stack([f(X) for f in self.F]) diff --git a/GPy/kern/parts/fixed.py b/GPy/kern/parts/fixed.py index dd5bdb85..680f0b14 100644 --- a/GPy/kern/parts/fixed.py +++ b/GPy/kern/parts/fixed.py @@ -31,7 +31,7 @@ class Fixed(Kernpart): def K(self, X, X2, target): target += self.variance * self.fixed_K - def dK_dtheta(self, partial, X, X2, target): + def _param_grad_helper(self, partial, X, X2, target): target += (partial * self.fixed_K).sum() def gradients_X(self, partial, X, X2, target): diff --git a/GPy/kern/parts/gibbs.py b/GPy/kern/parts/gibbs.py index 717703ce..68241245 100644 --- a/GPy/kern/parts/gibbs.py +++ b/GPy/kern/parts/gibbs.py @@ -85,7 +85,7 @@ class Gibbs(Kernpart): """Compute the diagonal of the covariance matrix for X.""" np.add(target, self.variance, target) - def dK_dtheta(self, dL_dK, X, X2, target): + def _param_grad_helper(self, dL_dK, X, X2, target): """Derivative of the covariance with respect to the parameters.""" self._K_computations(X, X2) self._dK_computations(dL_dK) diff --git a/GPy/kern/parts/hetero.py b/GPy/kern/parts/hetero.py index f48dddb4..507f6251 100644 --- a/GPy/kern/parts/hetero.py +++ b/GPy/kern/parts/hetero.py @@ -80,7 +80,7 @@ class Hetero(Kernpart): """Helper function for computing the diagonal elements of the covariance.""" return self.mapping.f(X).flatten()**2 - def dK_dtheta(self, dL_dK, X, X2, target): + def _param_grad_helper(self, dL_dK, X, X2, target): """Derivative of the covariance with respect to the parameters.""" if (X2 is None) or (X2 is X): dL_dKdiag = dL_dK.flat[::dL_dK.shape[0]+1] diff --git a/GPy/kern/parts/hierarchical.py b/GPy/kern/parts/hierarchical.py index 43dddd2d..3ca6b444 100644 --- a/GPy/kern/parts/hierarchical.py 
+++ b/GPy/kern/parts/hierarchical.py @@ -50,9 +50,9 @@ class Hierarchical(Kernpart): #X,slices = X[:,:-1],index_to_slices(X[:,-1]) #[[self.k.Kdiag(X[s],target[s]) for s in slices_i] for slices_i in slices] - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): X, X2, slices, slices2 = self._sort_slices(X,X2) - [[[[k.dK_dtheta(dL_dK[s,s2],X[s],X2[s2],target[p_start:p_stop]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices_, slices2_)] for k, p_start, p_stop, slices_, slices2_ in zip(self.parts, self.param_starts, self.param_stops, slices, slices2)] + [[[[k._param_grad_helper(dL_dK[s,s2],X[s],X2[s2],target[p_start:p_stop]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices_, slices2_)] for k, p_start, p_stop, slices_, slices2_ in zip(self.parts, self.param_starts, self.param_stops, slices, slices2)] def gradients_X(self,dL_dK,X,X2,target): diff --git a/GPy/kern/parts/independent_outputs.py b/GPy/kern/parts/independent_outputs.py index 8c0959c5..98f1203d 100644 --- a/GPy/kern/parts/independent_outputs.py +++ b/GPy/kern/parts/independent_outputs.py @@ -70,13 +70,13 @@ class IndependentOutputs(Kernpart): X,slices = X[:,:-1],index_to_slices(X[:,-1]) [[self.k.Kdiag(X[s],target[s]) for s in slices_i] for slices_i in slices] - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): X,slices = X[:,:-1],index_to_slices(X[:,-1]) if X2 is None: X2,slices2 = X,slices else: X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1]) - [[[self.k.dK_dtheta(dL_dK[s,s2],X[s],X2[s2],target) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)] + [[[self.k._param_grad_helper(dL_dK[s,s2],X[s],X2[s2],target) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)] def gradients_X(self,dL_dK,X,X2,target): diff --git a/GPy/kern/parts/kernpart.py b/GPy/kern/parts/kernpart.py index 2583d525..06f1446b 100644 --- a/GPy/kern/parts/kernpart.py +++ b/GPy/kern/parts/kernpart.py @@ -75,14 +75,14 @@ class Kernpart(Parameterized): raise NotImplementedError def Kdiag(self,X,target): raise NotImplementedError - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): raise NotImplementedError def dKdiag_dtheta(self,dL_dKdiag,X,target): - # In the base case compute this by calling dK_dtheta. Need to + # In the base case compute this by calling _param_grad_helper. Need to # override for stationary covariances (for example) to save # time. 
for i in range(X.shape[0]): - self.dK_dtheta(dL_dKdiag[i], X[i, :][None, :], X2=None, target=target) + self._param_grad_helper(dL_dKdiag[i], X[i, :][None, :], X2=None, target=target) def psi0(self,Z,mu,S,target): raise NotImplementedError def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target): @@ -109,8 +109,15 @@ class Kernpart(Parameterized): raise NotImplementedError def dKdiag_dX(self, dL_dK, X, target): raise NotImplementedError - - + def update_gradients_full(self, dL_dK, X): + """Set the gradients of all parameters when doing full (N) inference.""" + raise NotImplementedError + def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): + """Set the gradients of all parameters when doing sparse (M) inference.""" + raise NotImplementedError + def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + """Set the gradients of all parameters when doing variational (M) inference with uncertain inputs.""" + raise NotImplementedError class Kernpart_stationary(Kernpart): def __init__(self, input_dim, lengthscale=None, ARD=False): diff --git a/GPy/kern/parts/linear.py b/GPy/kern/parts/linear.py index 6ead4549..1b805219 100644 --- a/GPy/kern/parts/linear.py +++ b/GPy/kern/parts/linear.py @@ -8,6 +8,7 @@ from kernpart import Kernpart from ...util.linalg import tdot from ...util.misc import fast_array_equal, param_to_array from ...core.parameterization import Param +from ...core.parameterization.transformations import Logexp class Linear(Kernpart): """ @@ -43,8 +44,9 @@ class Linear(Kernpart): else: variances = np.ones(self.input_dim) - self.variances = Param('variances', variances) - self.add_parameters(self.variances) + self.variances = Param('variances', variances, Logexp()) + self.variances.gradient = np.zeros(self.variances.shape) + self.add_parameter(self.variances) self.variances.add_observer(self, self.update_variance) # initialize cache @@ -57,42 +59,35 @@ class Linear(Kernpart): def on_input_change(self, X): self._K_computations(X, None) + def update_gradients_full(self, dL_dK, X): + #self.variances.gradient[:] = 0 + self._param_grad_helper(dL_dK, X, self.variances.gradient) + + def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): + tmp = dL_dKdiag[:, None] * X ** 2 + if self.ARD: + self.variances.gradient = tmp.sum(0) + else: + self.variances.gradient = tmp.sum() + self._param_grad_helper(dL_dKmm, Z, None, self.variances.gradient) + self._param_grad_helper(dL_dKnm, X, Z, self.variances.gradient) + def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): self._psi_computations(Z, mu, S) - # psi0: tmp = dL_dpsi0[:, None] * self.mu2_S - if self.ARD: self.variances.gradient = tmp.sum(0) - else: self.variances.gradient = tmp.sum() - + if self.ARD: self.variances.gradient[:] = tmp.sum(0) + else: self.variances.gradient[:] = tmp.sum() #psi1 - self.dK_dtheta(dL_dpsi1, mu, Z, self.variances.gradient) - - #from psi2 + self._param_grad_helper(dL_dpsi1, mu, Z, self.variances.gradient) + #psi2 tmp = dL_dpsi2[:, :, :, None] * (self.ZAinner[:, :, None, :] * (2 * Z)[None, None, :, :]) if self.ARD: self.variances.gradient += tmp.sum(0).sum(0).sum(0) else: self.variances.gradient += tmp.sum() - #from Kmm self._K_computations(Z, None) - self.dK_dtheta(dL_dKmm, Z, None, self.variances.gradient) + self._param_grad_helper(dL_dKmm, Z, None, self.variances.gradient) - -# def _get_params(self): -# return self.variances -# -# def _set_params(self, x): -# assert x.size == (self.num_params) -# self.variances = x - #def 
parameters_changed(self): - # self.variances2 = np.square(self.variances) -# -# def _get_param_names(self): -# if self.num_params == 1: -# return ['variance'] -# else: -# return ['variance_%i' % i for i in range(self.variances.size)] - def K(self, X, X2, target): if self.ARD: XX = X * np.sqrt(self.variances) @@ -109,7 +104,7 @@ class Linear(Kernpart): def Kdiag(self, X, target): np.add(target, np.sum(self.variances * np.square(X), -1), target) - def dK_dtheta(self, dL_dK, X, X2, target): + def _param_grad_helper(self, dL_dK, X, X2, target): if self.ARD: if X2 is None: [np.add(target[i:i + 1], np.sum(dL_dK * tdot(X[:, i:i + 1])), target[i:i + 1]) for i in range(self.input_dim)] @@ -121,13 +116,6 @@ class Linear(Kernpart): self._K_computations(X, X2) target += np.sum(self._dot_product * dL_dK) - def dKdiag_dtheta(self, dL_dKdiag, X, target): - tmp = dL_dKdiag[:, None] * X ** 2 - if self.ARD: - target += tmp.sum(0) - else: - target += tmp.sum() - def gradients_X(self, dL_dK, X, X2, target): if X2 is None: target += 2*(((X[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1) @@ -145,14 +133,6 @@ class Linear(Kernpart): self._psi_computations(Z, mu, S) target += np.sum(self.variances * self.mu2_S, 1) - def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S, target): - self._psi_computations(Z, mu, S) - tmp = dL_dpsi0[:, None] * self.mu2_S - if self.ARD: - target += tmp.sum(0) - else: - target += tmp.sum() - def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S): target_mu += dL_dpsi0[:, None] * (2.0 * mu * self.variances) target_S += dL_dpsi0[:, None] * self.variances @@ -161,10 +141,6 @@ class Linear(Kernpart): """the variance, it does nothing""" self._psi1 = self.K(mu, Z, target) - def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target): - """the variance, it does nothing""" - self.dK_dtheta(dL_dpsi1, mu, Z, target) - def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S): """Do nothing for S, it does not affect psi1""" self._psi_computations(Z, mu, S) @@ -185,21 +161,13 @@ class Linear(Kernpart): def dpsi2_dtheta_new(self, dL_dpsi2, Z, mu, S, target): tmp = np.zeros((mu.shape[0], Z.shape[0])) self.K(mu,Z,tmp) - self.dK_dtheta(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target) + self._param_grad_helper(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target) result= 2.*(dL_dpsi2[:,:,:,None]*S[:,None,None,:]*self.variances*Z[None,:,None,:]*Z[None,None,:,:]).sum(0).sum(0).sum(0) if self.ARD: target += result.sum(0).sum(0).sum(0) else: target += result.sum() - def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target): - self._psi_computations(Z, mu, S) - tmp = dL_dpsi2[:, :, :, None] * (self.ZAinner[:, :, None, :] * (2 * Z)[None, None, :, :]) - if self.ARD: - target += tmp.sum(0).sum(0).sum(0) - else: - target += tmp.sum() - def dpsi2_dmuS_new(self, dL_dpsi2, Z, mu, S, target_mu, target_S): tmp = np.zeros((mu.shape[0], Z.shape[0])) self.K(mu,Z,tmp) @@ -309,11 +277,11 @@ class Linear(Kernpart): if not (fast_array_equal(X, self._X) and fast_array_equal(X2, self._X2)): self._X = X.copy() if X2 is None: - self._dot_product = tdot(X) + self._dot_product = tdot(param_to_array(X)) self._X2 = None else: self._X2 = X2.copy() - self._dot_product = np.dot(X, X2.T) + self._dot_product = np.dot(param_to_array(X), param_to_array(X2.T)) def _psi_computations(self, Z, mu, S): # here are the "statistics" for psi1 and psi2 diff --git a/GPy/kern/parts/mlp.py b/GPy/kern/parts/mlp.py index 2ba25802..59979a62 100644 --- a/GPy/kern/parts/mlp.py +++ b/GPy/kern/parts/mlp.py @@ -77,7 +77,7 @@ class MLP(Kernpart): 
self._K_diag_computations(X) target+= self.variance*self._K_diag_dvar - def dK_dtheta(self, dL_dK, X, X2, target): + def _param_grad_helper(self, dL_dK, X, X2, target): """Derivative of the covariance with respect to the parameters.""" self._K_computations(X, X2) denom3 = self._K_denom*self._K_denom*self._K_denom diff --git a/GPy/kern/parts/periodic_Matern32.py b/GPy/kern/parts/periodic_Matern32.py index 0de57f82..24ec45f9 100644 --- a/GPy/kern/parts/periodic_Matern32.py +++ b/GPy/kern/parts/periodic_Matern32.py @@ -112,7 +112,7 @@ class PeriodicMatern32(Kernpart): np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target) @silence_errors - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): """derivative of the covariance matrix with respect to the parameters (shape is num_data x num_inducing x num_params)""" if X2 is None: X2 = X FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X) diff --git a/GPy/kern/parts/periodic_Matern52.py b/GPy/kern/parts/periodic_Matern52.py index 882084fd..1f9d90b3 100644 --- a/GPy/kern/parts/periodic_Matern52.py +++ b/GPy/kern/parts/periodic_Matern52.py @@ -114,7 +114,7 @@ class PeriodicMatern52(Kernpart): np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target) @silence_errors - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): """derivative of the covariance matrix with respect to the parameters (shape is num_data x num_inducing x num_params)""" if X2 is None: X2 = X FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X) diff --git a/GPy/kern/parts/periodic_exponential.py b/GPy/kern/parts/periodic_exponential.py index d8c193e0..4562cd56 100644 --- a/GPy/kern/parts/periodic_exponential.py +++ b/GPy/kern/parts/periodic_exponential.py @@ -110,7 +110,7 @@ class PeriodicExponential(Kernpart): np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target) @silence_errors - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): """derivative of the covariance matrix with respect to the parameters (shape is N x num_inducing x num_params)""" if X2 is None: X2 = X FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X) diff --git a/GPy/kern/parts/poly.py b/GPy/kern/parts/poly.py index 80abab60..0deb11f4 100644 --- a/GPy/kern/parts/poly.py +++ b/GPy/kern/parts/poly.py @@ -86,7 +86,7 @@ class POLY(Kernpart): self._K_diag_computations(X) target+= self.variance*self._K_diag_dvar - def dK_dtheta(self, dL_dK, X, X2, target): + def _param_grad_helper(self, dL_dK, X, X2, target): """Derivative of the covariance with respect to the parameters.""" self._K_computations(X, X2) base = self.variance*self.degree*self._K_poly_arg**(self.degree-1) diff --git a/GPy/kern/parts/prod.py b/GPy/kern/parts/prod.py index 07286a82..364c91b3 100644 --- a/GPy/kern/parts/prod.py +++ b/GPy/kern/parts/prod.py @@ -54,15 +54,15 @@ class Prod(Kernpart): self.k1.update_gradients_full(dL_dK*self._K2, X[:,self.slice1]) self.k2.update_gradients_full(dL_dK*self._K1, X[:,self.slice2]) - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): """Derivative of the covariance matrix with respect to the parameters.""" self._K_computations(X,X2) if X2 is None: - self.k1.dK_dtheta(dL_dK*self._K2, X[:,self.slice1], None, target[:self.k1.num_params]) - self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.slice2], None, target[self.k1.num_params:]) + self.k1._param_grad_helper(dL_dK*self._K2, 
X[:,self.slice1], None, target[:self.k1.num_params]) + self.k2._param_grad_helper(dL_dK*self._K1, X[:,self.slice2], None, target[self.k1.num_params:]) else: - self.k1.dK_dtheta(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:self.k1.num_params]) - self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[self.k1.num_params:]) + self.k1._param_grad_helper(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:self.k1.num_params]) + self.k2._param_grad_helper(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[self.k1.num_params:]) def Kdiag(self,X,target): """Compute the diagonal of the covariance matrix associated to X.""" diff --git a/GPy/kern/parts/prod_orthogonal.py b/GPy/kern/parts/prod_orthogonal.py index f8d1c3b2..e7dd1fdc 100644 --- a/GPy/kern/parts/prod_orthogonal.py +++ b/GPy/kern/parts/prod_orthogonal.py @@ -41,15 +41,15 @@ class prod_orthogonal(Kernpart): self._K_computations(X,X2) target += self._K1 * self._K2 - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): """derivative of the covariance matrix with respect to the parameters.""" self._K_computations(X,X2) if X2 is None: - self.k1.dK_dtheta(dL_dK*self._K2, X[:,:self.k1.input_dim], None, target[:self.k1.num_params]) - self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.k1.input_dim:], None, target[self.k1.num_params:]) + self.k1._param_grad_helper(dL_dK*self._K2, X[:,:self.k1.input_dim], None, target[:self.k1.num_params]) + self.k2._param_grad_helper(dL_dK*self._K1, X[:,self.k1.input_dim:], None, target[self.k1.num_params:]) else: - self.k1.dK_dtheta(dL_dK*self._K2, X[:,:self.k1.input_dim], X2[:,:self.k1.input_dim], target[:self.k1.num_params]) - self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.k1.input_dim:], X2[:,self.k1.input_dim:], target[self.k1.num_params:]) + self.k1._param_grad_helper(dL_dK*self._K2, X[:,:self.k1.input_dim], X2[:,:self.k1.input_dim], target[:self.k1.num_params]) + self.k2._param_grad_helper(dL_dK*self._K1, X[:,self.k1.input_dim:], X2[:,self.k1.input_dim:], target[self.k1.num_params:]) def Kdiag(self,X,target): """Compute the diagonal of the covariance matrix associated to X.""" diff --git a/GPy/kern/parts/rational_quadratic.py b/GPy/kern/parts/rational_quadratic.py index bd623320..c36cee9f 100644 --- a/GPy/kern/parts/rational_quadratic.py +++ b/GPy/kern/parts/rational_quadratic.py @@ -52,7 +52,7 @@ class RationalQuadratic(Kernpart): def Kdiag(self,X,target): target += self.variance - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): if X2 is None: X2 = X dist2 = np.square((X-X2.T)/self.lengthscale) diff --git a/GPy/kern/parts/rbf.py b/GPy/kern/parts/rbf.py index e7bc8624..4b38bd0f 100644 --- a/GPy/kern/parts/rbf.py +++ b/GPy/kern/parts/rbf.py @@ -8,6 +8,7 @@ from kernpart import Kernpart from ...util.linalg import tdot from ...util.misc import fast_array_equal, param_to_array from ...core.parameterization import Param +from ...core.parameterization.transformations import Logexp class RBF(Kernpart): """ @@ -50,7 +51,7 @@ class RBF(Kernpart): else: lengthscale = np.ones(self.input_dim) - self.variance = Param('variance', variance) + self.variance = Param('variance', variance, Logexp()) self.lengthscale = Param('lengthscale', lengthscale) self.lengthscale.add_observer(self, self.update_lengthscale) @@ -141,7 +142,7 @@ class RBF(Kernpart): d_length = self._psi1[:,:,None] * ((self._psi1_dist_sq - 1.)/(self.lengthscale*self._psi1_denom) +1./self.lengthscale) dpsi1_dlength = d_length * dL_dpsi1[:, 
:, None] if not self.ARD: - self.lengthscale.gradeint = dpsi1_dlength.sum() + self.lengthscale.gradient = dpsi1_dlength.sum() else: self.lengthscale.gradient = dpsi1_dlength.sum(0).sum(0) diff --git a/GPy/kern/parts/rbf_inv.py b/GPy/kern/parts/rbf_inv.py index 0c0168a6..8405ae84 100644 --- a/GPy/kern/parts/rbf_inv.py +++ b/GPy/kern/parts/rbf_inv.py @@ -97,7 +97,7 @@ class RBFInv(RBF): # return ['variance'] + ['inv_lengthscale%i' % i for i in range(self.inv_lengthscale.size)] # TODO: Rewrite computations so that lengthscale is not needed (but only inv. lengthscale) - def dK_dtheta(self, dL_dK, X, X2, target): + def _param_grad_helper(self, dL_dK, X, X2, target): self._K_computations(X, X2) target[0] += np.sum(self._K_dvar * dL_dK) if self.ARD: diff --git a/GPy/kern/parts/rbfcos.py b/GPy/kern/parts/rbfcos.py index fc4a376a..9a4b8ab2 100644 --- a/GPy/kern/parts/rbfcos.py +++ b/GPy/kern/parts/rbfcos.py @@ -73,7 +73,7 @@ class RBFCos(Kernpart): def Kdiag(self,X,target): np.add(target,self.variance,target) - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): self._K_computations(X,X2) target[0] += np.sum(dL_dK*self._dvar) if self.ARD: diff --git a/GPy/kern/parts/spline.py b/GPy/kern/parts/spline.py index c31258f3..3e57b8a9 100644 --- a/GPy/kern/parts/spline.py +++ b/GPy/kern/parts/spline.py @@ -50,7 +50,7 @@ class Spline(Kernpart): def Kdiag(self,X,target): target += self.variance*X.flatten()**3/3. - def dK_dtheta(self,X,X2,target): + def _param_grad_helper(self,X,X2,target): target += 0.5*(t*s**2) - s**3/6. + (s_t)**3*theta(s_t)/6. def dKdiag_dtheta(self,X,target): diff --git a/GPy/kern/parts/symmetric.py b/GPy/kern/parts/symmetric.py index ef9a8dd5..8eec2acc 100644 --- a/GPy/kern/parts/symmetric.py +++ b/GPy/kern/parts/symmetric.py @@ -40,7 +40,7 @@ class Symmetric(Kernpart): self.k.K(X,AX2,target) self.k.K(AX,AX2,target) - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): """derivative of the covariance matrix with respect to the parameters.""" AX = np.dot(X,self.transform) if X2 is None: @@ -48,10 +48,10 @@ class Symmetric(Kernpart): ZX2 = AX else: AX2 = np.dot(X2, self.transform) - self.k.dK_dtheta(dL_dK,X,X2,target) - self.k.dK_dtheta(dL_dK,AX,X2,target) - self.k.dK_dtheta(dL_dK,X,AX2,target) - self.k.dK_dtheta(dL_dK,AX,AX2,target) + self.k._param_grad_helper(dL_dK,X,X2,target) + self.k._param_grad_helper(dL_dK,AX,X2,target) + self.k._param_grad_helper(dL_dK,X,AX2,target) + self.k._param_grad_helper(dL_dK,AX,AX2,target) def gradients_X(self,dL_dK,X,X2,target): diff --git a/GPy/kern/parts/sympykern.py b/GPy/kern/parts/sympykern.py index 46f975d2..a09d4bfc 100644 --- a/GPy/kern/parts/sympykern.py +++ b/GPy/kern/parts/sympykern.py @@ -348,7 +348,7 @@ class spkern(Kernpart): def Kdiag(self,X,target): self._weave_inline(self._Kdiag_code, X, target) - def dK_dtheta(self,partial,X,Z,target): + def _param_grad_helper(self,partial,X,Z,target): if Z is None: self._weave_inline(self._dK_dtheta_code_X, X, target, Z, partial) else: diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py index e6be2261..071fe62d 100644 --- a/GPy/likelihoods/gaussian.py +++ b/GPy/likelihoods/gaussian.py @@ -3,7 +3,7 @@ #TODO """ -A lot of this code assumes that the link functio nis the identity. +A lot of this code assumes that the link function is the identity. I think laplace code is okay, but I'm quite sure that the EP moments will only work if the link is identity. 
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py index 0fceac60..40cd66dd 100644 --- a/GPy/testing/kernel_tests.py +++ b/GPy/testing/kernel_tests.py @@ -5,7 +5,7 @@ import unittest import numpy as np import GPy -verbose = False +verbose = True try: import sympy diff --git a/GPy/testing/psi_stat_gradient_tests.py b/GPy/testing/psi_stat_gradient_tests.py index e373aaa3..56586d3b 100644 --- a/GPy/testing/psi_stat_gradient_tests.py +++ b/GPy/testing/psi_stat_gradient_tests.py @@ -9,42 +9,44 @@ import numpy import GPy import itertools from GPy.core import Model +from GPy.core.parameterization.param import Param +from GPy.core.parameterization.transformations import Logexp class PsiStatModel(Model): def __init__(self, which, X, X_variance, Z, num_inducing, kernel): + super(PsiStatModel, self).__init__(name='psi stat test') self.which = which - self.X = X - self.X_variance = X_variance - self.Z = Z + self.X = Param("X", X) + self.X_variance = Param('X_variance', X_variance, Logexp()) + self.Z = Param("Z", Z) self.N, self.input_dim = X.shape self.num_inducing, input_dim = Z.shape assert self.input_dim == input_dim, "shape missmatch: Z:{!s} X:{!s}".format(Z.shape, X.shape) self.kern = kernel - super(PsiStatModel, self).__init__() self.psi_ = self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance) - def _get_param_names(self): - Xnames = ["{}_{}_{}".format(what, i, j) for what, i, j in itertools.product(['X', 'X_variance'], range(self.N), range(self.input_dim))] - Znames = ["Z_{}_{}".format(i, j) for i, j in itertools.product(range(self.num_inducing), range(self.input_dim))] - return Xnames + Znames + self.kern._get_param_names() - def _get_params(self): - return numpy.hstack([self.X.flatten(), self.X_variance.flatten(), self.Z.flatten(), self.kern._get_params()]) - def _set_params(self, x, save_old=True, save_count=0): - start, end = 0, self.X.size - self.X = x[start:end].reshape(self.N, self.input_dim) - start, end = end, end + self.X_variance.size - self.X_variance = x[start: end].reshape(self.N, self.input_dim) - start, end = end, end + self.Z.size - self.Z = x[start: end].reshape(self.num_inducing, self.input_dim) - self.kern._set_params(x[end:]) + self.add_parameters(self.X, self.X_variance, self.Z, self.kern) + def log_likelihood(self): return self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance).sum() - def _log_likelihood_gradients(self): + + def parameters_changed(self): psimu, psiS = self.kern.__getattribute__("d" + self.which + "_dmuS")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance) + self.X.gradient = psimu + self.X_variance.gradient = psiS #psimu, psiS = numpy.ones(self.N * self.input_dim), numpy.ones(self.N * self.input_dim) - psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance) + try: psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance) + except AttributeError: psiZ = numpy.zeros_like(self.Z) + self.Z.gradient = psiZ #psiZ = numpy.ones(self.num_inducing * self.input_dim) - thetagrad = self.kern.__getattribute__("d" + self.which + "_dtheta")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance).flatten() - return numpy.hstack((psimu.flatten(), psiS.flatten(), psiZ.flatten(), thetagrad)) + N,M = self.X.shape[0], self.Z.shape[0] + dL_dpsi0, dL_dpsi1, dL_dpsi2 = numpy.zeros([N]), numpy.zeros([N,M]), numpy.zeros([N,M,M]) + if self.which == 'psi0': dL_dpsi0 += 1 + if self.which == 
'psi1': dL_dpsi1 += 1 + if self.which == 'psi2': dL_dpsi2 += 1 + self.kern.update_gradients_variational(numpy.zeros([1,1]), + dL_dpsi0, + dL_dpsi1, + dL_dpsi2, self.X, self.X_variance, self.Z) class DPsiStatTest(unittest.TestCase): input_dim = 5 @@ -57,61 +59,66 @@ class DPsiStatTest(unittest.TestCase): Y = X.dot(numpy.random.randn(input_dim, input_dim)) # kernels = [GPy.kern.linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)), GPy.kern.rbf(input_dim, ARD=True), GPy.kern.bias(input_dim)] - kernels = [GPy.kern.linear(input_dim), GPy.kern.rbf(input_dim), GPy.kern.bias(input_dim), - GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim), - GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim)] + kernels = [ + GPy.kern.linear(input_dim), + GPy.kern.rbf(input_dim), + #GPy.kern.bias(input_dim), + #GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim), + #GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim) + ] def testPsi0(self): for k in self.kernels: m = PsiStatModel('psi0', X=self.X, X_variance=self.X_var, Z=self.Z,\ num_inducing=self.num_inducing, kernel=k) - m.ensure_default_constraints() + #m.ensure_default_constraints(warning=0) m.randomize() - assert m.checkgrad(), "{} x psi0".format("+".join(map(lambda x: x.name, k.parts))) + import ipdb;ipdb.set_trace() + assert m.checkgrad(), "{} x psi0".format("+".join(map(lambda x: x.name, k._parameters_))) def testPsi1(self): for k in self.kernels: m = PsiStatModel('psi1', X=self.X, X_variance=self.X_var, Z=self.Z, num_inducing=self.num_inducing, kernel=k) - m.ensure_default_constraints() + m.ensure_default_constraints(warning=0) m.randomize() - assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts))) + assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k._parameters_))) def testPsi2_lin(self): k = self.kernels[0] m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z, num_inducing=self.num_inducing, kernel=k) - m.ensure_default_constraints() + m.ensure_default_constraints(warning=0) m.randomize() - assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts))) + assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_))) def testPsi2_lin_bia(self): k = self.kernels[3] m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z, num_inducing=self.num_inducing, kernel=k) - m.ensure_default_constraints() + m.ensure_default_constraints(warning=0) m.randomize() - assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts))) + assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_))) def testPsi2_rbf(self): k = self.kernels[1] m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z, num_inducing=self.num_inducing, kernel=k) - m.ensure_default_constraints() + m.ensure_default_constraints(warning=0) m.randomize() - assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts))) + assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_))) def testPsi2_rbf_bia(self): k = self.kernels[-1] m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z, num_inducing=self.num_inducing, kernel=k) - m.ensure_default_constraints() + m.ensure_default_constraints(warning=0) m.randomize() - assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts))) + assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_))) def testPsi2_bia(self): k = self.kernels[2] m = 
PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z, num_inducing=self.num_inducing, kernel=k) - m.ensure_default_constraints() + m.ensure_default_constraints(warning=0) m.randomize() - assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts))) + assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_))) if __name__ == "__main__":
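
# Note (not part of this patch): the assertions above all bottom out in the
# central-difference test performed by the revised Model.checkgrad in
# GPy/core/model.py. The following is a minimal standalone sketch of that
# check against a toy objective; the objective and helper names here are
# illustrative assumptions, not GPy API.

    import numpy as np

    def objective(x):
        # toy objective: f(x) = 0.5 * x'x
        return 0.5 * np.sum(x ** 2)

    def analytic_gradient(x):
        # analytic gradient of the toy objective
        return x

    def checkgrad(x, step=1e-6, tolerance=1e-3):
        # compare analytic against central-difference gradients,
        # mirroring the per-parameter loop in Model.checkgrad
        ok = True
        g = analytic_gradient(x)
        for i in range(x.size):
            xx = x.copy()
            xx[i] += step
            f1 = objective(xx)
            xx[i] -= 2. * step
            f2 = objective(xx)
            numerical = (f1 - f2) / (2. * step)
            ratio = numerical / g[i] if g[i] != 0 else 1.
            ok &= (np.abs(1. - ratio) < tolerance) or (np.abs(numerical - g[i]) < tolerance)
        return ok

    print(checkgrad(np.random.randn(5)))  # expected: True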