diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 62e16de1..8ce3482c 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -10,8 +10,13 @@ from model import Model from parameterization import ObsAr from .. import likelihoods from ..likelihoods.gaussian import Gaussian -from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation +from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation, LatentFunctionInference from parameterization.variational import VariationalPosterior +from scipy.sparse.base import issparse + +import logging +from GPy.util.normalizer import MeanNorm +logger = logging.getLogger("GP") class GP(Model): """ @@ -21,24 +26,46 @@ class GP(Model): :param Y: output observations :param kernel: a GPy kernel, defaults to rbf+white :param likelihood: a GPy likelihood + :param :class:`~GPy.inference.latent_function_inference.LatentFunctionInference` inference_method: The inference method to use for this GP :rtype: model object + :param Norm normalizer: + normalize the outputs Y. + Prediction will be un-normalized using this normalizer. + If normalizer is None, we will normalize using MeanNorm. + If normalizer is False, no normalization will be done. .. 
Note:: Multiple independent outputs are allowed using columns of Y """ - def __init__(self, X, Y, kernel, likelihood, inference_method=None, name='gp', Y_metadata=None): + def __init__(self, X, Y, kernel, likelihood, inference_method=None, name='gp', Y_metadata=None, normalizer=False): super(GP, self).__init__(name) assert X.ndim == 2 if isinstance(X, (ObsAr, VariationalPosterior)): - self.X = X + self.X = X.copy() else: self.X = ObsAr(X) self.num_data, self.input_dim = self.X.shape assert Y.ndim == 2 - self.Y = ObsAr(Y) + logger.info("initializing Y") + + if normalizer is None: + self.normalizer = MeanNorm() + elif normalizer is False: + self.normalizer = None + else: + self.normalizer = normalizer + + if self.normalizer is not None: + self.normalizer.scale_by(Y) + self.Y_normalized = ObsAr(self.normalizer.normalize(Y)) + self.Y = Y + else: + self.Y = ObsAr(Y) + self.Y_normalized = self.Y + assert Y.shape[0] == self.num_data _, self.output_dim = self.Y.shape @@ -53,6 +80,7 @@ class GP(Model): self.likelihood = likelihood #find a sensible inference method + logger.info("initializing inference method") if inference_method is None: if isinstance(likelihood, likelihoods.Gaussian) or isinstance(likelihood, likelihoods.MixedNoise): inference_method = exact_gaussian_inference.ExactGaussianInference() @@ -61,11 +89,12 @@ class GP(Model): print "defaulting to ", inference_method, "for latent function inference" self.inference_method = inference_method + logger.info("adding kernel and likelihood as parameters") self.add_parameter(self.kern) self.add_parameter(self.likelihood) def parameters_changed(self): - self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y, self.Y_metadata) + self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.Y_metadata) 
self.likelihood.update_gradients(self.grad_dict['dL_dthetaL']) self.kern.update_gradients_full(self.grad_dict['dL_dK'], self.X) @@ -130,6 +159,8 @@ class GP(Model): """ #predict the latent function values mu, var = self._raw_predict(Xnew, full_cov=full_cov, kern=kern) + if self.normalizer is not None: + mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var) # now push through likelihood mean, var = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata) @@ -137,8 +168,32 @@ class GP(Model): def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None): m, v = self._raw_predict(X, full_cov=False) + if self.normalizer is not None: + m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v) return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata) + def predictive_gradients(self, Xnew): + """ + Compute the derivatives of the latent function with respect to X* + + Given a set of points at which to predict X* (size [N*,Q]), compute the + derivatives of the mean and variance. Resulting arrays are sized: + dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one). + dv_dX* -- [N*, Q], (since all outputs have the same variance) + + """ + dmu_dX = np.empty((Xnew.shape[0],Xnew.shape[1],self.output_dim)) + for i in range(self.output_dim): + dmu_dX[:,:,i] = self.kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1].T, Xnew, self.X) + + # gradients wrt the diagonal part k_{xx} + dv_dX = self.kern.gradients_X(np.eye(Xnew.shape[0]), Xnew) + #grads wrt 'Schur' part K_{xf}K_{ff}^{-1}K_{fx} + alpha = -2.*np.dot(self.kern.K(Xnew, self.X),self.posterior.woodbury_inv) + dv_dX += self.kern.gradients_X(alpha, Xnew, self.X) + return dmu_dX, dv_dX + + def posterior_samples_f(self,X,size=10, full_cov=True): """ Samples the posterior GP at the points X. @@ -152,6 +207,8 @@ class GP(Model): :returns: Ysim: set of simulations, a Numpy array (N x samples). 
""" m, v = self._raw_predict(X, full_cov=full_cov) + if self.normalizer is not None: + m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v) v = v.reshape(m.size,-1) if len(v.shape)==3 else v if not full_cov: Ysim = np.random.multivariate_normal(m.flatten(), np.diag(v.flatten()), size).T @@ -179,44 +236,105 @@ class GP(Model): return Ysim - def plot_f(self, *args, **kwargs): + def plot_f(self, plot_limits=None, which_data_rows='all', + which_data_ycols='all', fixed_inputs=[], + levels=20, samples=0, fignum=None, ax=None, resolution=None, + plot_raw=True, + linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx'): """ - - Plot the GP's view of the world, where the data is normalized and - before applying a likelihood. - - This is a convenience function: arguments are passed to - GPy.plotting.matplot_dep.models_plots.plot_f_fit - + Plot the GP's view of the world, where the data is normalized and before applying a likelihood. + This is a call to plot with plot_raw=True. + Data will not be plotted in this, as the GP's view of the world + may live in another space, or units then the data. """ assert "matplotlib" in sys.modules, "matplotlib package has not been imported." from ..plotting.matplot_dep import models_plots - return models_plots.plot_fit_f(self,*args,**kwargs) + kw = {} + if linecol is not None: + kw['linecol'] = linecol + if fillcol is not None: + kw['fillcol'] = fillcol + return models_plots.plot_fit(self, plot_limits, which_data_rows, + which_data_ycols, fixed_inputs, + levels, samples, fignum, ax, resolution, + plot_raw=plot_raw, Y_metadata=Y_metadata, + data_symbol=data_symbol, **kw) - def plot(self, *args, **kwargs): + def plot(self, plot_limits=None, which_data_rows='all', + which_data_ycols='all', fixed_inputs=[], + levels=20, samples=0, fignum=None, ax=None, resolution=None, + plot_raw=False, + linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx'): """ Plot the posterior of the GP. 
- - In one dimension, the function is plotted with a shaded region - identifying two standard deviations. - - In two dimsensions, a contour-plot shows the mean predicted - function - - In higher dimensions, use fixed_inputs to plot the GP with some of - the inputs fixed. + - In one dimension, the function is plotted with a shaded region identifying two standard deviations. + - In two dimsensions, a contour-plot shows the mean predicted function + - In higher dimensions, use fixed_inputs to plot the GP with some of the inputs fixed. Can plot only part of the data and part of the posterior functions - using which_data_rows which_data_ycols and which_parts - - This is a convenience function: arguments are passed to - GPy.plotting.matplot_dep.models_plots.plot_fit + using which_data_rowsm which_data_ycols. + :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits + :type plot_limits: np.array + :param which_data_rows: which of the training data to plot (default all) + :type which_data_rows: 'all' or a slice object to slice model.X, model.Y + :param which_data_ycols: when the data has several columns (independant outputs), only plot these + :type which_data_rows: 'all' or a list of integers + :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v. + :type fixed_inputs: a list of tuples + :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D + :type resolution: int + :param levels: number of levels to plot in a contour plot. + :type levels: int + :param samples: the number of a posteriori samples to plot + :type samples: int + :param fignum: figure to plot on. + :type fignum: figure number + :param ax: axes to plot on. 
+ :type ax: axes handle + :type output: integer (first output is 0) + :param linecol: color of line to plot [Tango.colorsHex['darkBlue']] + :type linecol: + :param fillcol: color of fill [Tango.colorsHex['lightBlue']] + :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure """ assert "matplotlib" in sys.modules, "matplotlib package has not been imported." from ..plotting.matplot_dep import models_plots - return models_plots.plot_fit(self,*args,**kwargs) + kw = {} + if linecol is not None: + kw['linecol'] = linecol + if fillcol is not None: + kw['fillcol'] = fillcol + return models_plots.plot_fit(self, plot_limits, which_data_rows, + which_data_ycols, fixed_inputs, + levels, samples, fignum, ax, resolution, + plot_raw=plot_raw, Y_metadata=Y_metadata, + data_symbol=data_symbol, **kw) - def input_sensitivity(self): + def input_sensitivity(self, summarize=True): """ Returns the sensitivity for each dimension of this model """ - return self.kern.input_sensitivity() + return self.kern.input_sensitivity(summarize=summarize) + def optimize(self, optimizer=None, start=None, **kwargs): + """ + Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors. + kwargs are passed to the optimizer. 
They can be: + + :param max_f_eval: maximum number of function evaluations + :type max_f_eval: int + :messages: whether to display during optimisation + :type messages: bool + :param optimizer: which optimizer to use (defaults to self.preferred optimizer) + :type optimizer: string + + TODO: valid args + """ + self.inference_method.on_optimization_start() + try: + super(GP, self).optimize(optimizer, start, **kwargs) + except KeyboardInterrupt: + print "KeyboardInterrupt caught, calling on_optimization_end() to round things up" + self.inference_method.on_optimization_end() + raise diff --git a/GPy/core/model.py b/GPy/core/model.py index 38e8d4cf..00c6d5ff 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -20,7 +20,11 @@ class Model(Parameterized): super(Model, self).__init__(name) # Parameterized.__init__(self) self.optimization_runs = [] self.sampling_runs = [] - self.preferred_optimizer = 'scg' + self.preferred_optimizer = 'bfgs' + from .parameterization.ties_and_remappings import Tie + self.tie = Tie() + self.add_parameter(self.tie, -1) + self.add_observer(self.tie, self.tie._parameters_changed_notification, priority=-500) def log_likelihood(self): raise NotImplementedError, "this needs to be implemented to use the model class" @@ -61,7 +65,7 @@ class Model(Parameterized): on the current machine. """ - initial_parameters = self._get_params_transformed() + initial_parameters = self.optimizer_array.copy() if parallel: try: @@ -97,9 +101,9 @@ class Model(Parameterized): if len(self.optimization_runs): i = np.argmin([o.f_opt for o in self.optimization_runs]) - self._set_params_transformed(self.optimization_runs[i].x_opt) + self.optimizer_array = self.optimization_runs[i].x_opt else: - self._set_params_transformed(initial_parameters) + self.optimizer_array = initial_parameters def ensure_default_constraints(self, warning=True): """ @@ -118,30 +122,32 @@ class Model(Parameterized): """ The objective function for the given algorithm. 
- This function is the true objective, which wants to be minimized. - Note that all parameters are already set and in place, so you just need + This function is the true objective, which wants to be minimized. + Note that all parameters are already set and in place, so you just need to return the objective function here. For probabilistic models this is the negative log_likelihood - (including the MAP prior), so we return it here. If your model is not - probabilistic, just return your objective here! + (including the MAP prior), so we return it here. If your model is not + probabilistic, just return your objective to minimize here! """ return -float(self.log_likelihood()) - self.log_prior() def objective_function_gradients(self): """ The gradients for the objective function for the given algorithm. + The gradients are w.r.t. the *negative* objective function, as + this framework works with *negative* log-likelihoods as a default. You can find the gradient for the parameters in self.gradient at all times. This is the place, where gradients get stored for parameters. - This function is the true objective, which wants to be minimized. - Note that all parameters are already set and in place, so you just need + This function is the true objective, which wants to be minimized. + Note that all parameters are already set and in place, so you just need to return the gradient here. For probabilistic models this is the gradient of the negative log_likelihood - (including the MAP prior), so we return it here. If your model is not - probabilistic, just return your gradient here! + (including the MAP prior), so we return it here. If your model is not + probabilistic, just return your *negative* gradient here! 
""" return -(self._log_likelihood_gradients() + self._log_prior_gradients()) @@ -157,7 +163,8 @@ class Model(Parameterized): :type x: np.array """ try: - self._set_params_transformed(x) + # self._set_params_transformed(x) + self.optimizer_array = x obj_grads = self._transform_gradients(self.objective_function_gradients()) self._fail_count = 0 except (LinAlgError, ZeroDivisionError, ValueError): @@ -180,7 +187,7 @@ class Model(Parameterized): :parameter type: np.array """ try: - self._set_params_transformed(x) + self.optimizer_array = x obj = self.objective_function() self._fail_count = 0 except (LinAlgError, ZeroDivisionError, ValueError): @@ -192,7 +199,7 @@ class Model(Parameterized): def _objective_grads(self, x): try: - self._set_params_transformed(x) + self.optimizer_array = x obj_f, obj_grads = self.objective_function(), self._transform_gradients(self.objective_function_gradients()) self._fail_count = 0 except (LinAlgError, ZeroDivisionError, ValueError): @@ -220,22 +227,26 @@ class Model(Parameterized): if self.is_fixed: raise RuntimeError, "Cannot optimize, when everything is fixed" if self.size == 0: - raise RuntimeError, "Model without parameters cannot be minimized" + raise RuntimeError, "Model without parameters cannot be optimized" + + if start == None: + start = self.optimizer_array if optimizer is None: optimizer = self.preferred_optimizer - if start == None: - start = self._get_params_transformed() - - optimizer = optimization.get_optimizer(optimizer) - opt = optimizer(start, model=self, **kwargs) + if isinstance(optimizer, optimization.Optimizer): + opt = optimizer + opt.model = self + else: + optimizer = optimization.get_optimizer(optimizer) + opt = optimizer(start, model=self, **kwargs) opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads) self.optimization_runs.append(opt) - self._set_params_transformed(opt.x_opt) + self.optimizer_array = opt.x_opt def optimize_SGD(self, momentum=0.1, learning_rate=0.01, iterations=20, 
**kwargs): # assert self.Y.shape[1] > 1, "SGD only works with D > 1" @@ -246,7 +257,7 @@ class Model(Parameterized): def _checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3): """ Check the gradient of the ,odel by comparing to a numerical - estimate. If the verbose flag is passed, invividual + estimate. If the verbose flag is passed, individual components are tested (and printed) :param verbose: If True, print a "full" checking of each parameter @@ -260,7 +271,7 @@ class Model(Parameterized): The gradient is considered correct if the ratio of the analytical and numerical gradients is within of unity. """ - x = self._get_params_transformed().copy() + x = self.optimizer_array.copy() if not verbose: # make sure only to test the selected parameters @@ -270,8 +281,8 @@ class Model(Parameterized): transformed_index = self._raveled_index_for(target_param) if self._has_fixes(): indices = np.r_[:self.size] - which = (transformed_index[:,None]==indices[self._fixes_][None,:]).nonzero() - transformed_index = (indices-(~self._fixes_).cumsum())[transformed_index[which[0]]] + which = (transformed_index[:, None] == indices[self._fixes_][None, :]).nonzero() + transformed_index = (indices - (~self._fixes_).cumsum())[transformed_index[which[0]]] if transformed_index.size == 0: print "No free parameters to check" @@ -290,7 +301,7 @@ class Model(Parameterized): gradient = gradient[transformed_index] denominator = (2 * np.dot(dx, gradient)) - global_ratio = (f1 - f2) / np.where(denominator==0., 1e-32, denominator) + global_ratio = (f1 - f2) / np.where(denominator == 0., 1e-32, denominator) global_diff = np.abs(f1 - f2) < tolerance and np.allclose(gradient, 0, atol=tolerance) if global_ratio is np.nan: global_ratio = 0 @@ -319,10 +330,10 @@ class Model(Parameterized): param_index = self._raveled_index_for(target_param) if self._has_fixes(): indices = np.r_[:self.size] - which = (param_index[:,None]==indices[self._fixes_][None,:]).nonzero() + which = 
(param_index[:, None] == indices[self._fixes_][None, :]).nonzero() param_index = param_index[which[0]] - transformed_index = (indices-(~self._fixes_).cumsum())[param_index] - #print param_index, transformed_index + transformed_index = (indices - (~self._fixes_).cumsum())[param_index] + # print param_index, transformed_index else: transformed_index = param_index @@ -340,9 +351,9 @@ class Model(Parameterized): xx[xind] -= 2.*step f2 = self._objective(xx) numerical_gradient = (f1 - f2) / (2 * step) - if np.all(gradient[xind]==0): ratio = (f1-f2) == gradient[xind] + if np.all(gradient[xind] == 0): ratio = (f1 - f2) == gradient[xind] else: ratio = (f1 - f2) / (2 * step * gradient[xind]) - difference = np.abs((f1 - f2) / 2 / step - gradient[xind]) + difference = np.abs(numerical_gradient - gradient[xind]) if (np.abs(1. - ratio) < tolerance) or np.abs(difference) < tolerance: formatted_name = "\033[92m {0} \033[0m".format(names[nind]) @@ -358,7 +369,7 @@ class Model(Parameterized): grad_string = "{0:<{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}".format(formatted_name, r, d, g, ng, c0=cols[0] + 9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4]) print grad_string - self._set_params_transformed(x) + self.optimizer_array = x return ret diff --git a/GPy/core/parameterization/index_operations.py b/GPy/core/parameterization/index_operations.py index 57a18baf..78c0d2b9 100644 --- a/GPy/core/parameterization/index_operations.py +++ b/GPy/core/parameterization/index_operations.py @@ -1,47 +1,64 @@ -# Copyright (c) 2014, Max Zwiessele -# Licensed under the BSD 3-clause license (see LICENSE.txt) +''' +Created on Oct 2, 2013 + +@author: maxzwiessele +''' import numpy from numpy.lib.function_base import vectorize from lists_and_dicts import IntArrayDict +def extract_properties_to_index(index, props): + prop_index = dict() + for i, cl in enumerate(props): + for c in cl: + ind = prop_index.get(c, list()) + ind.append(index[i]) + prop_index[c] = ind + + for c, i in prop_index.items(): + 
prop_index[c] = numpy.array(i, dtype=int) + + return prop_index + + class ParameterIndexOperations(object): """ This object wraps a dictionary, whos keys are _operations_ that we'd like to apply to a parameter array, and whose values are np integer arrays which index the parameter array appropriately. - + A model instance will contain one instance of this class for each thing that needs indexing (i.e. constraints, ties and priors). Parameters within the model constain instances of the ParameterIndexOperationsView class, which can map from a 'local' index (starting 0) to this global index. - + Here's an illustration: - + #======================================================================= - model : 0 1 2 3 4 5 6 7 8 9 - key1: 4 5 - key2: 7 8 - - param1: 0 1 2 3 4 5 - key1: 2 3 - key2: 5 - - param2: 0 1 2 3 4 - key1: 0 - key2: 2 3 + model : 0 1 2 3 4 5 6 7 8 9 + key1: 4 5 + key2: 7 8 + + param1: 0 1 2 3 4 5 + key1: 2 3 + key2: 5 + + param2: 0 1 2 3 4 + key1: 0 + key2: 2 3 #======================================================================= - + The views of this global index have a subset of the keys in this global (model) index. - + Adding a new key (e.g. a constraint) to a view will cause the view to pass the new key to the global index, along with the local index and an offset. This global index then stores the key and the appropriate global index (which can be seen by the view). - + See also: - ParameterIndexOperationsView - + ParameterIndexOperationsView + """ _offset = 0 def __init__(self, constraints=None): @@ -92,8 +109,34 @@ class ParameterIndexOperations(object): return self._properties.values() def properties_for(self, index): + """ + Returns a list of properties, such that each entry in the list corresponds + to the element of the index given. 
+ + Example: + let properties: 'one':[1,2,3,4], 'two':[3,5,6] + + >>> properties_for([2,3,5]) + [['one'], ['one', 'two'], ['two']] + """ return vectorize(lambda i: [prop for prop in self.iterproperties() if i in self[prop]], otypes=[list])(index) + def properties_to_index_dict(self, index): + """ + Return a dictionary, containing properties as keys and indices as index + Thus, the indices for each constraint, which is contained will be collected as + one dictionary + + Example: + let properties: 'one':[1,2,3,4], 'two':[3,5,6] + + >>> properties_to_index_dict([2,3,5]) + {'one':[2,3], 'two':[3,5]} + """ + props = self.properties_for(index) + prop_index = extract_properties_to_index(index, props) + return prop_index + def add(self, prop, indices): self._properties[prop] = combine_indices(self._properties[prop], indices) @@ -200,8 +243,32 @@ class ParameterIndexOperationsView(object): def properties_for(self, index): + """ + Returns a list of properties, such that each entry in the list corresponds + to the element of the index given. 
+ + Example: + let properties: 'one':[1,2,3,4], 'two':[3,5,6] + + >>> properties_for([2,3,5]) + [['one'], ['one', 'two'], ['two']] + """ return vectorize(lambda i: [prop for prop in self.iterproperties() if i in self[prop]], otypes=[list])(index) + def properties_to_index_dict(self, index): + """ + Return a dictionary, containing properties as keys and indices as index + Thus, the indices for each constraint, which is contained will be collected as + one dictionary + + Example: + let properties: 'one':[1,2,3,4], 'two':[3,5,6] + + >>> properties_to_index_dict([2,3,5]) + {'one':[2,3], 'two':[3,5]} + """ + return extract_properties_to_index(index, self.properties_for(index)) + def add(self, prop, indices): self._param_index_ops.add(prop, indices+self._offset) diff --git a/GPy/core/parameterization/lists_and_dicts.py b/GPy/core/parameterization/lists_and_dicts.py index 604d0a01..0343909e 100644 --- a/GPy/core/parameterization/lists_and_dicts.py +++ b/GPy/core/parameterization/lists_and_dicts.py @@ -38,7 +38,12 @@ class ArrayList(list): raise ValueError, "{} is not in list".format(item) pass -class ObservablesList(object): +class ObserverList(object): + """ + A list which containts the observables. + It only holds weak references to observers, such that unbound + observers dont dangle in memory. + """ def __init__(self): self._poc = [] @@ -46,29 +51,44 @@ class ObservablesList(object): p,o,c = self._poc[ind] return p, o(), c - def remove(self, priority, observable, callble): + def remove(self, priority, observer, callble): """ + Remove one observer, which had priority and callble. 
""" self.flush() for i in range(len(self) - 1, -1, -1): p,o,c = self[i] - if priority==p and observable==o and callble==c: + if priority==p and observer==o and callble==c: del self._poc[i] def __repr__(self): return self._poc.__repr__() - def add(self, priority, observable, callble): - ins = 0 - for pr, _, _ in self: - if priority > pr: - break - ins += 1 - self._poc.insert(ins, (priority, weakref.ref(observable), callble)) - + def add(self, priority, observer, callble): + """ + Add an observer with priority and callble + """ + if observer is not None: + ins = 0 + for pr, _, _ in self: + if priority > pr: + break + ins += 1 + self._poc.insert(ins, (priority, weakref.ref(observer), callble)) + def __str__(self): + from . import ObsAr, Param + from parameter_core import Parameterizable ret = [] curr_p = None + + def frmt(o): + if isinstance(o, ObsAr): + return 'ObsArr <{}>'.format(hex(id(o))) + elif isinstance(o, (Param,Parameterizable)): + return '{}'.format(o.hierarchy_name()) + else: + return repr(o) for p, o, c in self: curr = '' if curr_p != p: @@ -77,27 +97,31 @@ class ObservablesList(object): else: curr_pre = " "*len(pre) curr_p = p curr += curr_pre - ret.append(curr + ", ".join(map(repr, [o,c]))) - return '\n'.join(ret) + + ret.append(curr + ", ".join([frmt(o), str(c)])) + return '\n'.join(ret) def flush(self): + """ + Make sure all weak references, which point to nothing are flushed (deleted) + """ self._poc = [(p,o,c) for p,o,c in self._poc if o() is not None] def __iter__(self): self.flush() for p, o, c in self._poc: - if o() is not None: - yield p, o(), c + yield p, o(), c def __len__(self): self.flush() return self._poc.__len__() def __deepcopy__(self, memo): - self.flush() - s = ObservablesList() - import copy - s._poc = copy.deepcopy(self._poc, memo) + s = ObserverList() + for p,o,c in self: + import copy + s.add(p, copy.deepcopy(o, memo), copy.deepcopy(c, memo)) + s.flush() return s def __getstate__(self): diff --git 
a/GPy/core/parameterization/observable_array.py b/GPy/core/parameterization/observable_array.py index a280d74f..09450b08 100644 --- a/GPy/core/parameterization/observable_array.py +++ b/GPy/core/parameterization/observable_array.py @@ -1,7 +1,7 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -__updated__ = '2014-04-15' +__updated__ = '2014-05-12' import numpy as np from parameter_core import Observable, Pickleable @@ -15,10 +15,10 @@ class ObsAr(np.ndarray, Pickleable, Observable): """ __array_priority__ = -1 # Never give back ObsAr def __new__(cls, input_array, *a, **kw): + # allways make a copy of input paramters, as we need it to be in C order: if not isinstance(input_array, ObsAr): - obj = np.atleast_1d(np.require(input_array, dtype=np.float64, requirements=['W', 'C'])).view(cls) + obj = np.atleast_1d(np.require(np.copy(input_array), dtype=np.float64, requirements=['W', 'C'])).view(cls) else: obj = input_array - #cls.__name__ = "ObsAr" # because of fixed printing of `array` in np printing super(ObsAr, obj).__init__(*a, **kw) return obj @@ -30,16 +30,22 @@ class ObsAr(np.ndarray, Pickleable, Observable): def __array_wrap__(self, out_arr, context=None): return out_arr.view(np.ndarray) + def _setup_observers(self): + # do not setup anything, as observable arrays do not have default observers + pass + def copy(self): + from lists_and_dicts import ObserverList memo = {} memo[id(self)] = self + memo[id(self.observers)] = ObserverList() return self.__deepcopy__(memo) def __deepcopy__(self, memo): s = self.__new__(self.__class__, input_array=self.view(np.ndarray).copy()) memo[id(self)] = s import copy - s.__dict__.update(copy.deepcopy(self.__dict__, memo)) + Pickleable.__setstate__(s, copy.deepcopy(self.__getstate__(), memo)) return s def __reduce__(self): diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 87854c9b..a6069a00 100644 --- 
a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -4,7 +4,7 @@ import itertools import numpy np = numpy -from parameter_core import OptimizationHandlable, adjust_name_for_printing +from parameter_core import Parameterizable, adjust_name_for_printing, Pickleable from observable_array import ObsAr ###### printing @@ -16,7 +16,7 @@ __precision__ = numpy.get_printoptions()['precision'] # numpy printing precision __print_threshold__ = 5 ###### -class Param(OptimizationHandlable, ObsAr): +class Param(Parameterizable, ObsAr): """ Parameter object for GPy models. @@ -42,10 +42,9 @@ class Param(OptimizationHandlable, ObsAr): """ __array_priority__ = -1 # Never give back Param _fixes_ = None - _parameters_ = [] + parameters = [] def __new__(cls, name, input_array, default_constraint=None): obj = numpy.atleast_1d(super(Param, cls).__new__(cls, input_array=input_array)) - cls.__name__ = "Param" obj._current_slice_ = (slice(obj.shape[0]),) obj._realshape_ = obj.shape obj._realsize_ = obj.size @@ -58,9 +57,9 @@ class Param(OptimizationHandlable, ObsAr): def build_pydot(self,G): import pydot - node = pydot.Node(id(self), shape='record', label=self.name) + node = pydot.Node(id(self), shape='trapezium', label=self.name)#, fontcolor='white', color='white') G.add_node(node) - for o in self.observers.keys(): + for _, o, _ in self.observers: label = o.name if hasattr(o, 'name') else str(o) observed_node = pydot.Node(id(o), label=label) G.add_node(observed_node) @@ -88,8 +87,18 @@ class Param(OptimizationHandlable, ObsAr): @property def param_array(self): + """ + As we are a leaf, this just returns self + """ return self + @property + def values(self): + """ + Return self as numpy array view + """ + return self.view(np.ndarray) + @property def gradient(self): """ @@ -100,11 +109,11 @@ class Param(OptimizationHandlable, ObsAr): """ if getattr(self, '_gradient_array_', None) is None: self._gradient_array_ = numpy.empty(self._realshape_, dtype=numpy.float64) 
- return self._gradient_array_[self._current_slice_] + return self._gradient_array_#[self._current_slice_] @gradient.setter def gradient(self, val): - self._gradient_array_[self._current_slice_] = val + self._gradient_array_[:] = val #=========================================================================== # Array operations -> done @@ -112,10 +121,13 @@ class Param(OptimizationHandlable, ObsAr): def __getitem__(self, s, *args, **kwargs): if not isinstance(s, tuple): s = (s,) - if not reduce(lambda a, b: a or numpy.any(b is Ellipsis), s, False) and len(s) <= self.ndim: - s += (Ellipsis,) + #if not reduce(lambda a, b: a or numpy.any(b is Ellipsis), s, False) and len(s) <= self.ndim: + # s += (Ellipsis,) new_arr = super(Param, self).__getitem__(s, *args, **kwargs) - try: new_arr._current_slice_ = s; new_arr._original_ = self.base is new_arr.base + try: + new_arr._current_slice_ = s + new_arr._gradient_array_ = self.gradient[s] + new_arr._original_ = self.base is new_arr.base except AttributeError: pass # returning 0d array or float, double etc return new_arr @@ -130,6 +142,9 @@ class Param(OptimizationHandlable, ObsAr): def _raveled_index_for(self, obj): return self._raveled_index() + #=========================================================================== + # Index recreation + #=========================================================================== def _expand_index(self, slice_index=None): # this calculates the full indexing arrays from the slicing objects given by get_item for _real..._ attributes # it basically translates slices to their respective index arrays and turns negative indices around @@ -138,6 +153,8 @@ class Param(OptimizationHandlable, ObsAr): slice_index = self._current_slice_ def f(a): a, b = a + if isinstance(a, numpy.ndarray) and a.dtype == bool: + raise ValueError, "Boolean indexing not implemented, use Param[np.where(index)] to index by boolean arrays!" 
if a not in (slice(None), Ellipsis): if isinstance(a, slice): start, stop, step = a.indices(b) @@ -170,14 +187,24 @@ class Param(OptimizationHandlable, ObsAr): #=========================================================================== # Pickling and copying #=========================================================================== + def copy(self): + return Parameterizable.copy(self, which=self) + def __deepcopy__(self, memo): s = self.__new__(self.__class__, name=self.name, input_array=self.view(numpy.ndarray).copy()) - memo[id(self)] = s + memo[id(self)] = s import copy - s.__dict__.update(copy.deepcopy(self.__dict__, memo)) + Pickleable.__setstate__(s, copy.deepcopy(self.__getstate__(), memo)) return s - - + def _setup_observers(self): + """ + Setup the default observers + + 1: pass through to parent, if present + """ + if self.has_parent(): + self.add_observer(self._parent_, self._parent_._pass_through_notify_observers, -np.inf) + #=========================================================================== # Printing -> done #=========================================================================== @@ -228,9 +255,16 @@ class Param(OptimizationHandlable, ObsAr): and len(set(map(len, clean_curr_slice))) <= 1): return numpy.fromiter(itertools.izip(*clean_curr_slice), dtype=[('', int)] * self._realndim_, count=len(clean_curr_slice[0])).view((int, self._realndim_)) - expanded_index = list(self._expand_index(slice_index)) - return numpy.fromiter(itertools.product(*expanded_index), + try: + expanded_index = list(self._expand_index(slice_index)) + indices = numpy.fromiter(itertools.product(*expanded_index), dtype=[('', int)] * self._realndim_, count=reduce(lambda a, b: a * b.size, expanded_index, 1)).view((int, self._realndim_)) + except: + print "Warning: extended indexing was used" + indices = np.indices(self._realshape_, dtype=int) + indices = indices[(slice(None),)+slice_index] + indices = np.rollaxis(indices, 0, indices.ndim) + return indices def 
_max_len_names(self, gen, header): gen = map(lambda x: " ".join(map(str, x)), gen) return reduce(lambda a, b:max(a, len(b)), gen, len(header)) @@ -272,7 +306,7 @@ class Param(OptimizationHandlable, ObsAr): class ParamConcatenation(object): def __init__(self, params): """ - Parameter concatenation for convienience of printing regular expression matched arrays + Parameter concatenation for convenience of printing regular expression matched arrays you can index this concatenation as if it was the flattened concatenation of all the parameters it contains, same for setting parameters (Broadcasting enabled). @@ -316,8 +350,8 @@ class ParamConcatenation(object): val = val.values() ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True; vals = self.values(); vals[s] = val - [numpy.copyto(p, vals[ps], where=ind[ps]) - for p, ps in zip(self.params, self._param_slices_)] + for p, ps in zip(self.params, self._param_slices_): + p.flat[ind[ps]] = vals[ps] if update: self.update_all_params() def values(self): diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 2c85b542..815f069b 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -1,4 +1,4 @@ -#t Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) """ Core module for parameterization. @@ -16,8 +16,9 @@ Observable Pattern for patameterization from transformations import Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, UNFIXED import numpy as np import re +import logging -__updated__ = '2014-04-16' +__updated__ = '2014-05-21' class HierarchyError(Exception): """ @@ -49,21 +50,49 @@ class Observable(object): as an observer. Every time the observable changes, it sends a notification with self as only argument to all its observers. 
""" - _updated = True + _updates = True def __init__(self, *args, **kwargs): super(Observable, self).__init__() - from lists_and_dicts import ObservablesList - self.observers = ObservablesList() + from lists_and_dicts import ObserverList + self.observers = ObserverList() + + @property + def updates(self): + p = getattr(self, '_highest_parent_', None) + if p is not None: + self._updates = p._updates + return self._updates + + @updates.setter + def updates(self, ups): + assert isinstance(ups, bool), "updates are either on (True) or off (False)" + p = getattr(self, '_highest_parent_', None) + if p is not None: + p._updates = ups + else: + self._updates = ups + if ups: + self._trigger_params_changed() def add_observer(self, observer, callble, priority=0): + """ + Add an observer `observer` with the callback `callble` + and priority `priority` to this observers list. + """ self.observers.add(priority, observer, callble) def remove_observer(self, observer, callble=None): + """ + Either (if callble is None) remove all callables, + which were added alongside observer, + or remove callable `callble` which was added alongside + the observer `observer`. 
+ """ to_remove = [] for poc in self.observers: _, obs, clble = poc if callble is not None: - if (obs == observer) and (callble == clble): + if (obs is observer) and (callble == clble): to_remove.append(poc) else: if obs is observer: @@ -81,6 +110,8 @@ class Observable(object): :param min_priority: only notify observers with priority > min_priority if min_priority is None, notify all observers in order """ + if not self.updates: + return if which is None: which = self if min_priority is None: @@ -91,10 +122,6 @@ class Observable(object): break callble(self, which=which) -#=============================================================================== -# Foundation framework for parameterized and param objects: -#=============================================================================== - class Parentable(object): """ Enable an Object to have a parent. @@ -124,7 +151,7 @@ class Parentable(object): """ Disconnect this object from its parent """ - raise NotImplementedError, "Abstaract superclass" + raise NotImplementedError, "Abstract superclass" @property def _highest_parent_(self): @@ -151,6 +178,7 @@ class Pickleable(object): """ def __init__(self, *a, **kw): super(Pickleable, self).__init__() + #=========================================================================== # Pickling operations #=========================================================================== @@ -162,61 +190,71 @@ class Pickleable(object): :param protocol: pickling protocol to use, python-pickle for details. 
""" import cPickle as pickle - import pickle #TODO: cPickle if isinstance(f, str): - with open(f, 'w') as f: + with open(f, 'wb') as f: pickle.dump(self, f, protocol) else: pickle.dump(self, f, protocol) - #=========================================================================== + #=========================================================================== # copy and pickling #=========================================================================== - def copy(self): - """Returns a (deep) copy of the current model""" + def copy(self, memo=None, which=None): + """ + Returns a (deep) copy of the current parameter handle. + + All connections to parents of the copy will be cut. + + :param dict memo: memo for deepcopy + :param Parameterized which: parameterized object which started the copy process [default: self] + """ #raise NotImplementedError, "Copy is not yet implemented, TODO: Observable hierarchy" + if memo is None: + memo = {} import copy - memo = {} - memo[id(self._parent_)] = None - memo[id(self.gradient)] = None - memo[id(self.param_array)] = None - memo[id(self._fixes_)] = None - c = copy.deepcopy(self, memo) - c._parent_index_ = None - return c + # the next part makes sure that we do not include parents in any form: + parents = [] + if which is None: + which = self + which.traverse_parents(parents.append) # collect parents + for p in parents: + if not memo.has_key(id(p)):memo[id(p)] = None # set all parents to be None, so they will not be copied + if not memo.has_key(id(self.gradient)):memo[id(self.gradient)] = None # reset the gradient + if not memo.has_key(id(self._fixes_)):memo[id(self._fixes_)] = None # fixes have to be reset, as this is now highest parent + copy = copy.deepcopy(self, memo) # and start the copy + copy._parent_index_ = None + copy._trigger_params_changed() + return copy def __deepcopy__(self, memo): - s = self.__new__(self.__class__) - memo[id(self)] = s + s = self.__new__(self.__class__) # fresh instance + memo[id(self)] = s # be 
sure to break all cycles --> self is already done import copy - s.__dict__.update(copy.deepcopy(self.__dict__, memo)) + s.__setstate__(copy.deepcopy(self.__getstate__(), memo)) # standard copy return s def __getstate__(self): - ignore_list = ([#'_parent_', '_parent_index_', - #'observers', - '_param_array_', '_gradient_array_', '_fixes_', - '_Cacher_wrap__cachers'] - #+ self.parameter_names(recursive=False) - ) + ignore_list = ['_param_array_', # parameters get set from bottom to top + '_gradient_array_', # as well as gradients + '_optimizer_copy_', + 'logger', + 'observers', + '_fixes_', # and fixes + '_Cacher_wrap__cachers', # never pickle cachers + ] dc = dict() for k,v in self.__dict__.iteritems(): if k not in ignore_list: - #if hasattr(v, "__getstate__"): - #dc[k] = v.__getstate__() - #else: dc[k] = v return dc - + def __setstate__(self, state): self.__dict__.update(state) - return self + from lists_and_dicts import ObserverList + self.observers = ObserverList() + self._setup_observers() + self._optimizer_copy_transformed = False - #def __getstate__(self, memo): - # raise NotImplementedError, "get state must be implemented to be able to pickle objects" - - #def __setstate__(self, memo): - # raise NotImplementedError, "set state must be implemented to be able to pickle objects" class Gradcheckable(Pickleable, Parentable): """ @@ -243,7 +281,7 @@ class Gradcheckable(Pickleable, Parentable): """ if self.has_parent(): return self._highest_parent_._checkgrad(self, verbose=verbose, step=step, tolerance=tolerance) - return self._checkgrad(self[''], verbose=verbose, step=step, tolerance=tolerance) + return self._checkgrad(self, verbose=verbose, step=step, tolerance=tolerance) def _checkgrad(self, param, verbose=0, step=1e-6, tolerance=1e-3): """ @@ -252,7 +290,6 @@ class Gradcheckable(Pickleable, Parentable): """ raise HierarchyError, "This parameter is not in a model with a likelihood, and, therefore, cannot be gradient checked!" 
- class Nameable(Gradcheckable): """ Make an object nameable inside the hierarchy. @@ -291,41 +328,8 @@ class Nameable(Gradcheckable): return self._parent_.hierarchy_name() + "." + adjust(self.name) return adjust(self.name) -class Indexable(object): - """ - Enable enraveled indexes and offsets for this object. - The raveled index of an object is the index for its parameters in a flattened int array. - """ - def __init__(self, *a, **kw): - super(Indexable, self).__init__() - def _raveled_index(self): - """ - Flattened array of ints, specifying the index of this object. - This has to account for shaped parameters! - """ - raise NotImplementedError, "Need to be able to get the raveled Index" - - def _offset_for(self, param): - """ - Return the offset of the param inside this parameterized object. - This does not need to account for shaped parameters, as it - basically just sums up the parameter sizes which come before param. - """ - return 0 - #raise NotImplementedError, "shouldnt happen, offset required from non parameterization object?" - - def _raveled_index_for(self, param): - """ - get the raveled index for a param - that is an int array, containing the indexes for the flattened - param inside this parameterized logic. - """ - return param._raveled_index() - #raise NotImplementedError, "shouldnt happen, raveld index transformation required from non parameterization object?" - - -class Constrainable(Nameable, Indexable, Observable): +class Indexable(Nameable, Observable): """ Make an object constrainable with Priors and Transformations. TODO: Mappings!! 
@@ -336,7 +340,7 @@ class Constrainable(Nameable, Indexable, Observable): :func:`constrain()` and :func:`unconstrain()` are main methods here """ def __init__(self, name, default_constraint=None, *a, **kw): - super(Constrainable, self).__init__(name=name, *a, **kw) + super(Indexable, self).__init__(name=name, *a, **kw) self._default_constraint_ = default_constraint from index_operations import ParameterIndexOperations self.constraints = ParameterIndexOperations() @@ -358,6 +362,40 @@ class Constrainable(Nameable, Indexable, Observable): self._connect_fixes() self._notify_parent_change() + #=========================================================================== + # Indexable + #=========================================================================== + def _offset_for(self, param): + """ + Return the offset of the param inside this parameterized object. + This does not need to account for shaped parameters, as it + basically just sums up the parameter sizes which come before param. + """ + if param.has_parent(): + p = param._parent_._get_original(param) + if p in self.parameters: + return reduce(lambda a,b: a + b.size, self.parameters[:p._parent_index_], 0) + return self._offset_for(param._parent_) + param._parent_._offset_for(param) + return 0 + + def _raveled_index_for(self, param): + """ + get the raveled index for a param + that is an int array, containing the indexes for the flattened + param inside this parameterized logic. + """ + from param import ParamConcatenation + if isinstance(param, ParamConcatenation): + return np.hstack((self._raveled_index_for(p) for p in param.params)) + return param._raveled_index() + self._offset_for(param) + + def _raveled_index(self): + """ + Flattened array of ints, specifying the index of this object. + This has to account for shaped parameters! 
+ """ + return np.r_[:self.size] + #=========================================================================== # Fixing Parameters: #=========================================================================== @@ -369,8 +407,9 @@ class Constrainable(Nameable, Indexable, Observable): """ if value is not None: self[:] = value - reconstrained = self.unconstrain() - index = self._add_to_index_operations(self.constraints, reconstrained, __fixed__, warning) + + index = self.unconstrain() + index = self._add_to_index_operations(self.constraints, index, __fixed__, warning) self._highest_parent_._set_fixed(self, index) self.notify_observers(self, None if trigger_parent else -np.inf) return index @@ -412,9 +451,24 @@ class Constrainable(Nameable, Indexable, Observable): self._fixes_ = None del self.constraints[__fixed__] + #=========================================================================== + # Convenience for fixed + #=========================================================================== def _has_fixes(self): return hasattr(self, "_fixes_") and self._fixes_ is not None and self._fixes_.size == self.size + @property + def is_fixed(self): + for p in self.parameters: + if not p.is_fixed: return False + return True + + def _get_original(self, param): + # if advanced indexing is activated it happens that the array is a copy + # you can retrieve the original param through this method, by passing + # the copy here + return self.parameters[param._parent_index_] + #=========================================================================== # Prior Operations #=========================================================================== @@ -438,8 +492,7 @@ class Constrainable(Nameable, Indexable, Observable): def unset_priors(self, *priors): """ - Un-set all priors given from this parameter handle. - + Un-set all priors given (in *priors) from this parameter handle. 
""" return self._remove_from_index_operations(self.priors, priors) @@ -458,6 +511,22 @@ class Constrainable(Nameable, Indexable, Observable): [np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.iteritems()] return ret return 0. + + #=========================================================================== + # Tie parameters together + #=========================================================================== + + def _has_ties(self): + if self._highest_parent_.tie.tied_param is None: + return False + if self.has_parent(): + return self._highest_parent_.tie.label_buf[self._highest_parent_._raveled_index_for(self)].sum()>0 + return True + + def tie_together(self): + self._highest_parent_.tie.add_tied_parameter(self) + self._highest_parent_._set_fixed(self,self._raveled_index()) + self._trigger_params_changed() #=========================================================================== # Constrain operations -> done @@ -471,8 +540,6 @@ class Constrainable(Nameable, Indexable, Observable): Constrain the parameter to the given :py:class:`GPy.core.transformations.Transformation`. - - :returns added: the indices that were constrained """ self.param_array[...] 
= transform.initialize(self.param_array) reconstrained = self.unconstrain() @@ -480,37 +547,6 @@ class Constrainable(Nameable, Indexable, Observable): self.notify_observers(self, None if trigger_parent else -np.inf) return added - def tie(self, name): - #remove any constraints - old_const = self.constraints.properties()[:] - self.unconstrain() - - #set these parameters to be 'fixed' as in, not optimized - self._highest_parent_._set_fixed(self, self._raveled_index()) - - #see if a tie exists with that name - if name in self._highest_parent_.ties: - t = self._highest_parent_.ties[name] - else: - #create a tie object - value = np.atleast_1d(self.param_array)[0]*1 - import ties_and_remappings - t = ties_and_remappings.Tie(value=value, name=name) - - #add the new tie object to the global index - self._highest_parent_.ties[name] = t - self._highest_parent_.add_parameter(t) - - #constrain the tie as we were constrained - if len(old_const)==1: - t.constrain(old_const[0]) - - - self.constraints.add(t, self._raveled_index()) - t.add_tied_parameter(self) - - - def unconstrain(self, *transforms): """ :param transforms: The transformations to unconstrain from. @@ -574,13 +610,13 @@ class Constrainable(Nameable, Indexable, Observable): self.constraints = ParameterIndexOperationsView(parent.constraints, parent._offset_for(self), self.size) self.priors = ParameterIndexOperationsView(parent.priors, parent._offset_for(self), self.size) self._fixes_ = None - for p in self._parameters_: + for p in self.parameters: p._parent_changed(parent) def _add_to_index_operations(self, which, reconstrained, what, warning): """ Helper preventing copy code. - This addes the given what (transformation, prior etc) to parameter index operations which. + This adds the given what (transformation, prior etc) to parameter index operations which. revonstrained are reconstrained indices. warn when reconstraining parameters if warning is True. 
TODO: find out which parameters have changed specifically @@ -602,64 +638,122 @@ class Constrainable(Nameable, Indexable, Observable): removed = np.empty((0,), dtype=int) for t in transforms: unconstrained = which.remove(t, self._raveled_index()) - print unconstrained removed = np.union1d(removed, unconstrained) if t is __fixed__: self._highest_parent_._set_unfixed(self, unconstrained) return removed -class OptimizationHandlable(Constrainable): +class OptimizationHandlable(Indexable): """ This enables optimization handles on an Object as done in GPy 0.4. - `..._transformed`: make sure the transformations and constraints etc are handled + `..._optimizer_copy_transformed`: make sure the transformations and constraints etc are handled """ def __init__(self, name, default_constraint=None, *a, **kw): super(OptimizationHandlable, self).__init__(name, default_constraint=default_constraint, *a, **kw) + self._optimizer_copy_ = None + self._optimizer_copy_transformed = False - def transform(self): - [np.put(self.param_array, ind, c.finv(self.param_array.flat[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] + #=========================================================================== + # Optimizer copy + #=========================================================================== + @property + def optimizer_array(self): + """ + Array for the optimizer to work on. + This array always lives in the space for the optimizer. + Thus, it is untransformed, going from Transformations. - def untransform(self): - [np.put(self.param_array, ind, c.f(self.param_array.flat[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] + Setting this array, will make sure the transformed parameters for this model + will be set accordingly. It has to be set with an array, retrieved from + this method, as e.g. fixing will resize the array. 
- def _get_params_transformed(self): - # transformed parameters (apply transformation rules) - p = self.param_array.copy() - [np.put(p, ind, c.finv(p[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] + The optimizer should only interfere with this array, such that transformations + are secured. + """ + if self.__dict__.get('_optimizer_copy_', None) is None or self.size != self._optimizer_copy_.size: + self._optimizer_copy_ = np.empty(self.size) + + if not self._optimizer_copy_transformed: + self._optimizer_copy_.flat = self.param_array.flat + [np.put(self._optimizer_copy_, ind, c.finv(self.param_array[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] + if self.has_parent() and (self.constraints[__fixed__].size != 0 or self._has_ties()): + fixes = np.ones(self.size).astype(bool) + fixes[self.constraints[__fixed__]] = FIXED + return self._optimizer_copy_[np.logical_and(fixes, self._highest_parent_.tie.getTieFlag(self))] + elif self._has_fixes(): + return self._optimizer_copy_[self._fixes_] + + self._optimizer_copy_transformed = True + + return self._optimizer_copy_ + + @optimizer_array.setter + def optimizer_array(self, p): + """ + Make sure the optimizer copy does not get touched, thus, we only want to + set the values *inside* not the array itself. + + Also we want to update param_array in here. 
+ """ + f = None if self.has_parent() and self.constraints[__fixed__].size != 0: - fixes = np.ones(self.size).astype(bool) - fixes[self.constraints[__fixed__]] = FIXED - return p[fixes] + f = np.ones(self.size).astype(bool) + f[self.constraints[__fixed__]] = FIXED elif self._has_fixes(): - return p[self._fixes_] - return p + f = self._fixes_ + if f is None: + self.param_array.flat = p + [np.put(self.param_array, ind, c.f(self.param_array.flat[ind])) + for c, ind in self.constraints.iteritems() if c != __fixed__] + else: + self.param_array.flat[f] = p + [np.put(self.param_array, ind[f[ind]], c.f(self.param_array.flat[ind[f[ind]]])) + for c, ind in self.constraints.iteritems() if c != __fixed__] + self._highest_parent_.tie.propagate_val() - def _set_params_transformed(self, p): - if p is self.param_array: - p = p.copy() - if self.has_parent() and self.constraints[__fixed__].size != 0: - fixes = np.ones(self.size).astype(bool) - fixes[self.constraints[__fixed__]] = FIXED - self.param_array.flat[fixes] = p - elif self._has_fixes(): self.param_array.flat[self._fixes_] = p - else: self.param_array.flat = p - self.untransform() + self._optimizer_copy_transformed = False self._trigger_params_changed() + def _get_params_transformed(self): + raise DeprecationWarning, "_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array insetad!" +# + def _set_params_transformed(self, p): + raise DeprecationWarning, "_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array insetad!" + def _trigger_params_changed(self, trigger_parent=True): - [p._trigger_params_changed(trigger_parent=False) for p in self._parameters_] + """ + First tell all children to update, + then update yourself. + + If trigger_parent is True, we will tell the parent, otherwise not. 
+ """ + [p._trigger_params_changed(trigger_parent=False) for p in self.parameters if not p.is_fixed] self.notify_observers(None, None if trigger_parent else -np.inf) def _size_transformed(self): + """ + As fixes are not passed to the optimiser, the size of the model for the optimiser + is the size of all parameters minus the size of the fixes. + """ return self.size - self.constraints[__fixed__].size + def _transform_gradients(self, g): + """ + Transform the gradients by multiplying the gradient factor for each + constraint to it. + """ + self._highest_parent_.tie.collate_gradient() + [np.put(g, i, g[i] * c.gradfactor(self.param_array[i])) for c, i in self.constraints.iteritems() if c != __fixed__] + if self._has_fixes(): return g[self._fixes_] + return g + @property def num_params(self): """ Return the number of parameters of this parameter_handle. - Param objects will allways return 0. + Param objects will always return 0. """ raise NotImplemented, "Abstract, please implement in respective classes" @@ -673,10 +767,11 @@ class OptimizationHandlable(Constrainable): """ if adjust_for_printing: adjust = lambda x: adjust_name_for_printing(x) else: adjust = lambda x: x - if recursive: names = [xi for x in self._parameters_ for xi in x.parameter_names(add_self=True, adjust_for_printing=adjust_for_printing)] - else: names = [adjust(x.name) for x in self._parameters_] + if recursive: names = [xi for x in self.parameters for xi in x.parameter_names(add_self=True, adjust_for_printing=adjust_for_printing)] + else: names = [adjust(x.name) for x in self.parameters] if add_self: names = map(lambda x: adjust(self.name) + "." 
+ x, names) return names + def _get_param_names(self): n = np.array([p.hierarchy_name() + '[' + str(i) + ']' for p in self.flattened_parameters for i in p._indices()]) return n @@ -690,28 +785,34 @@ class OptimizationHandlable(Constrainable): #=========================================================================== # Randomizeable #=========================================================================== - def randomize(self, rand_gen=np.random.normal, loc=0, scale=1, *args, **kwargs): + def randomize(self, rand_gen=np.random.normal, *args, **kwargs): """ Randomize the model. Make this draw from the prior if one exists, else draw from given random generator - :param rand_gen: numpy random number generator which takes args and kwargs + :param rand_gen: np random number generator which takes args and kwargs :param flaot loc: loc parameter for random number generator :param float scale: scale parameter for random number generator :param args, kwargs: will be passed through to random number generator """ # first take care of all parameters (from N(0,1)) - x = rand_gen(loc=loc, scale=scale, size=self._size_transformed(), *args, **kwargs) + x = rand_gen(size=self._size_transformed(), *args, **kwargs) + self.updates = False # Switch off the updates + self.optimizer_array = x # makes sure all of the tied parameters get the same init (since there's only one prior object...) # now draw from prior where possible + x = self.param_array.copy() [np.put(x, ind, p.rvs(ind.size)) for p, ind in self.priors.iteritems() if not p is None] - self._set_params_transformed(x) # makes sure all of the tied parameters get the same init (since there's only one prior object...) + unfixlist = np.ones((self.size,),dtype=np.bool) + unfixlist[self.constraints[__fixed__]] = False + self.param_array[unfixlist] = x[unfixlist] + self.updates = True #=========================================================================== # For shared memory arrays. 
This does nothing in Param, but sets the memory # for all parameterized objects #=========================================================================== @property - def full_gradient(self): + def gradient_full(self): """ Note to users: This does not return the gradient in the right shape! Use self.gradient @@ -725,44 +826,143 @@ class OptimizationHandlable(Constrainable): return self._gradient_array_ def _propagate_param_grad(self, parray, garray): + """ + For propagating the param_array and gradient_array. + This ensures the in memory view of each subsequent array. + + 1.) connect param_array of children to self.param_array + 2.) tell all children to propagate further + """ + if self.param_array.size != self.size: + self._param_array_ = np.empty(self.size, dtype=np.float64) + if self.gradient.size != self.size: + self._gradient_array_ = np.empty(self.size, dtype=np.float64) + pi_old_size = 0 - for pi in self._parameters_: + for pi in self.parameters: pislice = slice(pi_old_size, pi_old_size + pi.size) self.param_array[pislice] = pi.param_array.flat # , requirements=['C', 'W']).flat - self.full_gradient[pislice] = pi.full_gradient.flat # , requirements=['C', 'W']).flat + self.gradient_full[pislice] = pi.gradient_full.flat # , requirements=['C', 'W']).flat pi.param_array.data = parray[pislice].data - pi.full_gradient.data = garray[pislice].data + pi.gradient_full.data = garray[pislice].data pi._propagate_param_grad(parray[pislice], garray[pislice]) pi_old_size += pi.size + def _connect_parameters(self): + pass + class Parameterizable(OptimizationHandlable): + """ + A parameterisable class. + + This class provides the parameters list (ArrayList) and standard parameter handling, + such as {add|remove}_parameter(), traverse hierarchy and param_array, gradient_array + and the empty parameters_changed(). + + This class is abstract and should not be instantiated. + Use GPy.core.Parameterized() as node (or leaf) in the parameterized hierarchy. 
+ Use GPy.core.Param() for a leaf in the parameterized hierarchy. + """ def __init__(self, *args, **kwargs): super(Parameterizable, self).__init__(*args, **kwargs) from GPy.core.parameterization.lists_and_dicts import ArrayList - self._parameters_ = ArrayList() - self.size = 0 + self.parameters = ArrayList() + self._param_array_ = None self._added_names_ = set() - self.ties = {} + self.logger = logging.getLogger(self.__class__.__name__) + self.__visited = False # for traversing in reverse order we need to know if we were here already @property def param_array(self): - if not hasattr(self, '_param_array_'): + """ + Array representing the parameters of this class. + There is only one copy of all parameters in memory, two during optimization. + + !WARNING!: setting the parameter array MUST always be done in memory: + m.param_array[:] = m_copy.param_array + """ + if self.__dict__.get('_param_array_', None) is None: self._param_array_ = np.empty(self.size, dtype=np.float64) return self._param_array_ + @property + def unfixed_param_array(self): + """ + Array representing the parameters of this class. + There is only one copy of all parameters in memory, two during optimization. + + !WARNING!: setting the parameter array MUST always be done in memory: + m.param_array[:] = m_copy.param_array + """ + if self.__dict__.get('_param_array_', None) is None: + self._param_array_ = np.empty(self.size, dtype=np.float64) + + if self.constraints[__fixed__].size !=0: + fixes = np.ones(self.size).astype(bool) + fixes[self.constraints[__fixed__]] = FIXED + return self._param_array_[fixes] + else: + return self._param_array_ + @param_array.setter def param_array(self, arr): self._param_array_ = arr + def traverse(self, visit, *args, **kwargs): + """ + Traverse the hierarchy performing visit(self, *args, **kwargs) + at every node passed by downwards. This function includes self! + + See "visitor pattern" in literature. This is implemented in pre-order fashion. 
+ + Example: + Collect all children: + + children = [] + self.traverse(children.append) + print children + """ + if not self.__visited: + visit(self, *args, **kwargs) + self.__visited = True + for c in self.parameters: + c.traverse(visit, *args, **kwargs) + self.__visited = False + + def traverse_parents(self, visit, *args, **kwargs): + """ + Traverse the hierarchy upwards, visiting all parents and their children except self. + See "visitor pattern" in literature. This is implemented in pre-order fashion. + + Example: + + parents = [] + self.traverse_parents(parents.append) + print parents + """ + if self.has_parent(): + self.__visited = True + self._parent_._traverse_parents(visit, *args, **kwargs) + self.__visited = False + + def _traverse_parents(self, visit, *args, **kwargs): + if not self.__visited: + self.__visited = True + visit(self, *args, **kwargs) + if self.has_parent(): + self._parent_._traverse_parents(visit, *args, **kwargs) + self._parent_.traverse(visit, *args, **kwargs) + self.__visited = False + #========================================================================= # Gradient handling #========================================================================= @property def gradient(self): - if not hasattr(self, '_gradient_array_'): + if self.__dict__.get('_gradient_array_', None) is None: self._gradient_array_ = np.empty(self.size, dtype=np.float64) return self._gradient_array_ @@ -772,26 +972,37 @@ class Parameterizable(OptimizationHandlable): @property def num_params(self): - return len(self._parameters_) + return len(self.parameters) def _add_parameter_name(self, param, ignore_added_names=False): pname = adjust_name_for_printing(param.name) if ignore_added_names: self.__dict__[pname] = param return + + def warn_and_retry(): + print """ + WARNING: added a parameter with formatted name {}, + which is already assigned to {}. 
+ Trying to change the parameter name to + + {}.{} + """.format(pname, self.hierarchy_name(), self.hierarchy_name(), param.name + "_") + param.name += "_" + self._add_parameter_name(param, ignore_added_names) # and makes sure to not delete programmatically added parameters if pname in self.__dict__: if not (param is self.__dict__[pname]): if pname in self._added_names_: del self.__dict__[pname] self._add_parameter_name(param) + else: + warn_and_retry() elif pname not in dir(self): self.__dict__[pname] = param self._added_names_.add(pname) else: - print "WARNING: added a parameter with formatted name {}, which is already a member of {} object. Trying to change the parameter name to\n {}".format(pname, self.__class__, param.name + "_") - param.name += "_" - self._add_parameter_name(param, ignore_added_names) + warn_and_retry() def _remove_parameter_name(self, param=None, pname=None): assert param is None or pname is None, "can only delete either param by name, or the name of a param" @@ -805,155 +1016,33 @@ class Parameterizable(OptimizationHandlable): self._remove_parameter_name(None, old_name) self._add_parameter_name(param) - def add_parameter(self, param, index=None, _ignore_added_names=False): - """ - :param parameters: the parameters to add - :type parameters: list of or one :py:class:`GPy.core.param.Param` - :param [index]: index of where to put parameters - - :param bool _ignore_added_names: whether the name of the parameter overrides a possibly existing field - - Add all parameters to this param class, you can insert parameters - at any given index using the :func:`list.insert` syntax - """ - if param in self._parameters_ and index is not None: - self.remove_parameter(param) - self.add_parameter(param, index) - # elif param.has_parent(): - # raise HierarchyError, "parameter {} already in another model ({}), create new object (or copy) for adding".format(param._short(), param._highest_parent_._short()) - elif param not in self._parameters_: - if 
param.has_parent(): - parent = param._parent_ - while parent is not None: - if parent is self: - raise HierarchyError, "You cannot add a parameter twice into the hierarchy" - parent = parent._parent_ - param._parent_.remove_parameter(param) - # make sure the size is set - if index is None: - self.constraints.update(param.constraints, self.size) - self.priors.update(param.priors, self.size) - self._parameters_.append(param) - else: - start = sum(p.size for p in self._parameters_[:index]) - self.constraints.shift_right(start, param.size) - self.priors.shift_right(start, param.size) - self.constraints.update(param.constraints, start) - self.priors.update(param.priors, start) - self._parameters_.insert(index, param) - - param.add_observer(self, self._pass_through_notify_observers, -np.inf) - - parent = self - while parent is not None: - parent.size += param.size - parent = parent._parent_ - - self._connect_parameters() - - self._highest_parent_._connect_parameters(ignore_added_names=_ignore_added_names) - self._highest_parent_._notify_parent_change() - self._highest_parent_._connect_fixes() - - else: - raise HierarchyError, """Parameter exists already and no copy made""" - - - def add_parameters(self, *parameters): - """ - convenience method for adding several - parameters without gradient specification - """ - [self.add_parameter(p) for p in parameters] - - def remove_parameter(self, param): - """ - :param param: param object to remove from being a parameter of this parameterized object. 
- """ - if not param in self._parameters_: - raise RuntimeError, "Parameter {} does not belong to this object, remove parameters directly from their respective parents".format(param._short()) - - start = sum([p.size for p in self._parameters_[:param._parent_index_]]) - self._remove_parameter_name(param) - self.size -= param.size - del self._parameters_[param._parent_index_] - - param._disconnect_parent() - param.remove_observer(self, self._pass_through_notify_observers) - self.constraints.shift_left(start, param.size) - - self._connect_parameters() - self._notify_parent_change() - - parent = self._parent_ - while parent is not None: - parent.size -= param.size - parent = parent._parent_ - - self._highest_parent_._connect_parameters() - self._highest_parent_._connect_fixes() - self._highest_parent_._notify_parent_change() - - def _connect_parameters(self, ignore_added_names=False): - # connect parameterlist to this parameterized object - # This just sets up the right connection for the params objects - # to be used as parameters - # it also sets the constraints for each parameter to the constraints - # of their respective parents - if not hasattr(self, "_parameters_") or len(self._parameters_) < 1: - # no parameters for this class - return - old_size = 0 - self.param_array = np.empty(self.size, dtype=np.float64) - self._gradient_array_ = np.empty(self.size, dtype=np.float64) - - self._param_slices_ = [] - for i, p in enumerate(self._parameters_): - p._parent_ = self - p._parent_index_ = i - - pslice = slice(old_size, old_size + p.size) - # first connect all children - p._propagate_param_grad(self.param_array[pslice], self.full_gradient[pslice]) - # then connect children to self - self.param_array[pslice] = p.param_array.flat # , requirements=['C', 'W']).ravel(order='C') - self.full_gradient[pslice] = p.full_gradient.flat # , requirements=['C', 'W']).ravel(order='C') - - if not p.param_array.flags['C_CONTIGUOUS']: - raise ValueError, "This should not happen! 
Please write an email to the developers with the code, which reproduces this error. All parameter arrays must be C_CONTIGUOUS" - p.param_array.data = self.param_array[pslice].data - p.full_gradient.data = self.full_gradient[pslice].data - - self._param_slices_.append(pslice) - - self._add_parameter_name(p, ignore_added_names=ignore_added_names) - old_size += p.size + def __setstate__(self, state): + super(Parameterizable, self).__setstate__(state) + self.logger = logging.getLogger(self.__class__.__name__) + return self #=========================================================================== # notification system #=========================================================================== def _parameters_changed_notification(self, me, which=None): + """ + In parameterizable we just need to make sure, that the next call to optimizer_array + will update the optimizer_array to the latest parameters + """ + self._optimizer_copy_transformed = False # tells the optimizer array to update on next request self.parameters_changed() def _pass_through_notify_observers(self, me, which=None): self.notify_observers(which=which) + def _setup_observers(self): + """ + Setup the default observers - #=========================================================================== - # Pickling - #=========================================================================== - def __setstate__(self, state): - super(Parameterizable, self).__setstate__(state) - self._connect_parameters() - self._connect_fixes() - self._notify_parent_change() - - self.parameters_changed() - - def copy(self): - c = super(Parameterizable, self).copy() - c._connect_parameters() - c._connect_fixes() - c._notify_parent_change() - return c + 1: parameters_changed_notify + 2: pass through to parent, if present + """ + self.add_observer(self, self._parameters_changed_notification, -100) + if self.has_parent(): + self.add_observer(self._parent_, self._parent_._pass_through_notify_observers, -np.inf) 
#=========================================================================== # From being parentable, we have to define the parent_change notification #=========================================================================== @@ -961,7 +1050,7 @@ class Parameterizable(OptimizationHandlable): """ Notify all parameters that the parent has changed """ - for p in self._parameters_: + for p in self.parameters: p._parent_changed(self) def parameters_changed(self): @@ -972,4 +1061,3 @@ class Parameterizable(OptimizationHandlable): updates get passed through. See :py:function:``GPy.core.param.Observable.add_observer`` """ pass - diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py index 82ad3753..b8eb4528 100644 --- a/GPy/core/parameterization/parameterized.py +++ b/GPy/core/parameterization/parameterized.py @@ -2,19 +2,28 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy; np = numpy -import cPickle import itertools from re import compile, _pattern_type from param import ParamConcatenation -from parameter_core import Pickleable, Parameterizable, adjust_name_for_printing -from transformations import __fixed__ -from lists_and_dicts import ArrayList +from parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing + +import logging +logger = logging.getLogger("parameters changed meta") class ParametersChangedMeta(type): def __call__(self, *args, **kw): - instance = super(ParametersChangedMeta, self).__call__(*args, **kw) - instance.parameters_changed() - return instance + self._in_init_ = True + #import ipdb;ipdb.set_trace() + self = super(ParametersChangedMeta, self).__call__(*args, **kw) + logger.debug("finished init") + self._in_init_ = False + logger.debug("connecting parameters") + self._highest_parent_._connect_parameters() + self._highest_parent_._notify_parent_change() + self._highest_parent_._connect_fixes() + logger.debug("calling parameters changed") + 
self.parameters_changed() + return self class Parameterized(Parameterizable): """ @@ -59,37 +68,34 @@ class Parameterized(Parameterizable): and concatenate them. Printing m[''] will result in printing of all parameters in detail. """ #=========================================================================== - # Metaclass for parameters changed after init. + # Metaclass for parameters changed after init. # This makes sure, that parameters changed will always be called after __init__ - # **Never** call parameters_changed() yourself + # **Never** call parameters_changed() yourself __metaclass__ = ParametersChangedMeta #=========================================================================== def __init__(self, name=None, parameters=[], *a, **kw): super(Parameterized, self).__init__(name=name, *a, **kw) - self._in_init_ = True - self._parameters_ = ArrayList() - self.size = sum(p.size for p in self._parameters_) + self.size = sum(p.size for p in self.parameters) self.add_observer(self, self._parameters_changed_notification, -100) if not self._has_fixes(): self._fixes_ = None self._param_slices_ = [] - self._connect_parameters() - del self._in_init_ + #self._connect_parameters() self.add_parameters(*parameters) def build_pydot(self, G=None): import pydot # @UnresolvedImport iamroot = False if G is None: - G = pydot.Dot(graph_type='digraph') + G = pydot.Dot(graph_type='digraph', bgcolor=None) iamroot=True - node = pydot.Node(id(self), shape='record', label=self.name) + node = pydot.Node(id(self), shape='box', label=self.name)#, color='white') G.add_node(node) - for child in self._parameters_: + for child in self.parameters: child_node = child.build_pydot(G) - G.add_edge(pydot.Edge(node, child_node)) + G.add_edge(pydot.Edge(node, child_node))#, color='white')) - for o in self.observers.keys(): + for _, o, _ in self.observers: label = o.name if hasattr(o, 'name') else str(o) observed_node = pydot.Node(id(o), label=label) G.add_node(observed_node) @@ -101,58 +107,143 @@ 
class Parameterized(Parameterizable): return node #=========================================================================== - # Gradient control + # Add remove parameters: #=========================================================================== - def _transform_gradients(self, g): - if self.has_parent(): - return g - [numpy.put(g, i, g[i] * c.gradfactor(self.param_array[i])) for c, i in self.constraints.iteritems() if c != __fixed__] - if self._has_fixes(): return g[self._fixes_] - return g - - - #=========================================================================== - # Indexable - #=========================================================================== - def _offset_for(self, param): - # get the offset in the parameterized index array for param - if param.has_parent(): - if param._parent_._get_original(param) in self._parameters_: - return self._param_slices_[param._parent_._get_original(param)._parent_index_].start - return self._offset_for(param._parent_) + param._parent_._offset_for(param) - return 0 - - def _raveled_index_for(self, param): + def add_parameter(self, param, index=None, _ignore_added_names=False): """ - get the raveled index for a param - that is an int array, containing the indexes for the flattened - param inside this parameterized logic. - """ - if isinstance(param, ParamConcatenation): - return numpy.hstack((self._raveled_index_for(p) for p in param.params)) - return param._raveled_index() + self._offset_for(param) + :param parameters: the parameters to add + :type parameters: list of or one :py:class:`GPy.core.param.Param` + :param [index]: index of where to put parameters - def _raveled_index(self): - """ - get the raveled index for this object, - this is not in the global view of things! 
- """ - return numpy.r_[:self.size] + :param bool _ignore_added_names: whether the name of the parameter overrides a possibly existing field - #=========================================================================== - # Convenience for fixed, tied checking of param: - #=========================================================================== - @property - def is_fixed(self): - for p in self._parameters_: - if not p.is_fixed: return False - return True + Add all parameters to this param class, you can insert parameters + at any given index using the :func:`list.insert` syntax + """ + if param in self.parameters and index is not None: + self.remove_parameter(param) + self.add_parameter(param, index) + # elif param.has_parent(): + # raise HierarchyError, "parameter {} already in another model ({}), create new object (or copy) for adding".format(param._short(), param._highest_parent_._short()) + elif param not in self.parameters: + if param.has_parent(): + def visit(parent, self): + if parent is self: + raise HierarchyError, "You cannot add a parameter twice into the hierarchy" + param.traverse_parents(visit, self) + param._parent_.remove_parameter(param) + # make sure the size is set + if index is None: + start = sum(p.size for p in self.parameters) + self.constraints.shift_right(start, param.size) + self.priors.shift_right(start, param.size) + self.constraints.update(param.constraints, self.size) + self.priors.update(param.priors, self.size) + self.parameters.append(param) + else: + start = sum(p.size for p in self.parameters[:index]) + self.constraints.shift_right(start, param.size) + self.priors.shift_right(start, param.size) + self.constraints.update(param.constraints, start) + self.priors.update(param.priors, start) + self.parameters.insert(index, param) - def _get_original(self, param): - # if advanced indexing is activated it happens that the array is a copy - # you can retrieve the original param through this method, by passing - # the copy here - return 
self._parameters_[param._parent_index_] + param.add_observer(self, self._pass_through_notify_observers, -np.inf) + + parent = self + while parent is not None: + parent.size += param.size + parent = parent._parent_ + + if not self._in_init_: + self._connect_parameters() + self._notify_parent_change() + + self._highest_parent_._connect_parameters(ignore_added_names=_ignore_added_names) + self._highest_parent_._notify_parent_change() + self._highest_parent_._connect_fixes() + + else: + raise HierarchyError, """Parameter exists already, try making a copy""" + + + def add_parameters(self, *parameters): + """ + convenience method for adding several + parameters without gradient specification + """ + [self.add_parameter(p) for p in parameters] + + def remove_parameter(self, param): + """ + :param param: param object to remove from being a parameter of this parameterized object. + """ + if not param in self.parameters: + try: + raise RuntimeError, "{} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name) + except AttributeError: + raise RuntimeError, "{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param)) + + start = sum([p.size for p in self.parameters[:param._parent_index_]]) + self._remove_parameter_name(param) + self.size -= param.size + del self.parameters[param._parent_index_] + + param._disconnect_parent() + param.remove_observer(self, self._pass_through_notify_observers) + self.constraints.shift_left(start, param.size) + + self._connect_parameters() + self._notify_parent_change() + + parent = self._parent_ + while parent is not None: + parent.size -= param.size + parent = parent._parent_ + + self._highest_parent_._connect_parameters() + self._highest_parent_._connect_fixes() + self._highest_parent_._notify_parent_change() + + def _connect_parameters(self, ignore_added_names=False): + # connect parameterlist to this parameterized object 
+ # This just sets up the right connection for the params objects + # to be used as parameters + # it also sets the constraints for each parameter to the constraints + # of their respective parents + if not hasattr(self, "parameters") or len(self.parameters) < 1: + # no parameters for this class + return + if self.param_array.size != self.size: + self._param_array_ = np.empty(self.size, dtype=np.float64) + if self.gradient.size != self.size: + self._gradient_array_ = np.empty(self.size, dtype=np.float64) + + old_size = 0 + self._param_slices_ = [] + for i, p in enumerate(self.parameters): + if not p.param_array.flags['C_CONTIGUOUS']: + raise ValueError, "This should not happen! Please write an email to the developers with the code, which reproduces this error. All parameter arrays must be C_CONTIGUOUS" + + p._parent_ = self + p._parent_index_ = i + + pslice = slice(old_size, old_size + p.size) + + # first connect all children + p._propagate_param_grad(self.param_array[pslice], self.gradient_full[pslice]) + + # then connect children to self + self.param_array[pslice] = p.param_array.flat # , requirements=['C', 'W']).ravel(order='C') + self.gradient_full[pslice] = p.gradient_full.flat # , requirements=['C', 'W']).ravel(order='C') + + p.param_array.data = self.param_array[pslice].data + p.gradient_full.data = self.gradient_full[pslice].data + + self._param_slices_.append(pslice) + + self._add_parameter_name(p, ignore_added_names=ignore_added_names) + old_size += p.size #=========================================================================== # Get/set parameters: @@ -199,10 +290,38 @@ class Parameterized(Parameterizable): def __setattr__(self, name, val): # override the default behaviour, if setting a param, so broadcasting can by used - if hasattr(self, "_parameters_"): - pnames = self.parameter_names(False, adjust_for_printing=True, recursive=False) - if name in pnames: self._parameters_[pnames.index(name)][:] = val; return + if hasattr(self, "parameters"): + 
try: + pnames = self.parameter_names(False, adjust_for_printing=True, recursive=False) + if name in pnames: self.parameters[pnames.index(name)][:] = val; return + except AttributeError: + pass object.__setattr__(self, name, val); + + #=========================================================================== + # Pickling + #=========================================================================== + def __setstate__(self, state): + super(Parameterized, self).__setstate__(state) + try: + self._connect_parameters() + self._connect_fixes() + self._notify_parent_change() + self.parameters_changed() + except Exception as e: + print "WARNING: caught exception {!s}, trying to continue".format(e) + + def copy(self, memo=None): + if memo is None: + memo = {} + memo[id(self.optimizer_array)] = None # and param_array + memo[id(self.param_array)] = None # and param_array + copy = super(Parameterized, self).copy(memo) + copy._connect_parameters() + copy._connect_fixes() + copy._notify_parent_change() + return copy + #=========================================================================== # Printing: #=========================================================================== @@ -210,29 +329,29 @@ class Parameterized(Parameterizable): return self.hierarchy_name() @property def flattened_parameters(self): - return [xi for x in self._parameters_ for xi in x.flattened_parameters] + return [xi for x in self.parameters for xi in x.flattened_parameters] @property def _parameter_sizes_(self): - return [x.size for x in self._parameters_] + return [x.size for x in self.parameters] @property def parameter_shapes(self): - return [xi for x in self._parameters_ for xi in x.parameter_shapes] + return [xi for x in self.parameters for xi in x.parameter_shapes] @property def _constraints_str(self): - return [cs for p in self._parameters_ for cs in p._constraints_str] + return [cs for p in self.parameters for cs in p._constraints_str] @property def _priors_str(self): - return [cs for p in 
self._parameters_ for cs in p._priors_str] + return [cs for p in self.parameters for cs in p._priors_str] @property def _description_str(self): - return [xi for x in self._parameters_ for xi in x._description_str] + return [xi for x in self.parameters for xi in x._description_str] @property def _ties_str(self): return [','.join(x._ties_str) for x in self.flattened_parameters] def __str__(self, header=True): name = adjust_name_for_printing(self.name) + "." - constrs = self._constraints_str; + constrs = self._constraints_str; ts = self._ties_str prirs = self._priors_str desc = self._description_str; names = self.parameter_names() @@ -245,7 +364,7 @@ class Parameterized(Parameterizable): to_print = [] for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs): to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p)) - # to_print = [format_spec.format(p=p, const=c, t=t) if isinstance(p, Param) else p.__str__(header=False) for p, c, t in itertools.izip(self._parameters_, constrs, ts)] + # to_print = [format_spec.format(p=p, const=c, t=t) if isinstance(p, Param) else p.__str__(header=False) for p, c, t in itertools.izip(self.parameters, constrs, ts)] sep = '-' * (nl + sl + cl + + pl + tl + 8 * 2 + 3) if header: header = " {{0:<{0}s}} | {{1:^{1}s}} | {{2:^{2}s}} | {{3:^{3}s}} | {{4:^{4}s}}".format(nl, sl, cl, pl, tl).format(name, "Value", "Constraint", "Prior", "Tied to") diff --git a/GPy/core/parameterization/priors.py b/GPy/core/parameterization/priors.py index 29adc923..ddc4d02f 100644 --- a/GPy/core/parameterization/priors.py +++ b/GPy/core/parameterization/priors.py @@ -76,11 +76,11 @@ class Uniform(Prior): o = super(Prior, cls).__new__(cls, lower, upper) cls._instances.append(weakref.ref(o)) return cls._instances[-1]() - + def __init__(self, lower, upper): self.lower = float(lower) self.upper = float(upper) - + def __str__(self): return "[" + str(np.round(self.lower)) + ', ' + str(np.round(self.upper)) + ']' @@ -93,7 +93,7 @@ class 
Uniform(Prior): def rvs(self, n): return np.random.uniform(self.lower, self.upper, size=n) - + class LogGaussian(Prior): """ Implementation of the univariate *log*-Gaussian probability function, coupled with random variables. @@ -246,7 +246,7 @@ class Gamma(Prior): """ Creates an instance of a Gamma Prior by specifying the Expected value(s) and Variance(s) of the distribution. - + :param E: expected value :param V: variance """ diff --git a/GPy/core/parameterization/ties_and_remappings.py b/GPy/core/parameterization/ties_and_remappings.py index 75b46a95..a81b8d61 100644 --- a/GPy/core/parameterization/ties_and_remappings.py +++ b/GPy/core/parameterization/ties_and_remappings.py @@ -31,48 +31,193 @@ class Fix(Remapping): -class Tie(Remapping): - def __init__(self, value, name): +class Tie(Parameterized): + """ + The new parameter tie framework. (under development) + + All the parameters tied together get a new parameter inside the *Tie* object. + Its value should always be equal to all the tied parameters, and its gradient + is the sum of all the tied parameters. + + =====Implementation Details===== + The *Tie* object should only exist on the top of param tree (the highest parent). + + self.label_buf: + It uses a label buffer that has the same length as all the parameters (self._highest_parent_.param_array). + The buffer keeps track of all the tied parameters. All the tied parameters have a label (an interger) higher + than 0, and the parameters that have the same label are tied together. + + self.buf_index: + An auxiliary index list for the global index of the tie parameter inside the *Tie* object. 
+ + ================================ + + TODO: + * EVERYTHING + + """ + def __init__(self, name='tie'): super(Tie, self).__init__(name) - self.tied_parameters = [] - self.value = Param('val', value) - self.add_parameter(self.value) + self.tied_param = None + # The buffer keeps track of tie status + self.label_buf = None + # The global indices of the 'tied' param + self.buf_idx = None + # A boolean array indicating non-tied parameters + self._tie_ = None + + def getTieFlag(self, p=None): + if self.tied_param is None: + if self._tie_ is None or self._tie_.size != self._highest_parent_.param_array.size: + self._tie_ = np.ones((self._highest_parent_.param_array.size,),dtype=np.bool) + if p is not None: + return self._tie_[p._highest_parent_._raveled_index_for(p)] + return self._tie_ + + def _init_labelBuf(self): + if self.label_buf is None: + self.label_buf = np.zeros(self._highest_parent_.param_array.shape, dtype=np.int) + if self._tie_ is None or self._tie_.size != self._highest_parent_.param_array.size: + self._tie_ = np.ones((self._highest_parent_.param_array.size,),dtype=np.bool) + + def _updateTieFlag(self): + if self._tie_.size != self.label_buf.size: + self._tie_ = np.ones((self._highest_parent_.param_array.size,),dtype=np.bool) + self._tie_[self.label_buf>0] = False + self._tie_[self.buf_idx] = True - def add_tied_parameter(self, p): - self.tied_parameters.append(p) - p.add_observer(self, self.callback) - self.parameters_changed() - - def callback(self, param=None, which=None): + def add_tied_parameter(self, p, p2=None): """ - This gets called whenever any of the tied parameters changes. we spend - considerable effort working out whhat has changed ant to what value. - Then we store that value in self.value, and broadcast it everywhere - with parameters_changed. 
+ Tie the list of parameters p together (p2==None) or + Tie the list of parameters p with the list of parameters p2 (p2!=None) """ - if which is self:return - index = self._highest_parent_.constraints[self] - if len(index)==0: - return # nothing to tie together, this tie exists without any tied parameters - vals = self._highest_parent_.param_array[index] - uvals = np.unique(vals) - if len(uvals)==1: - #all of the tied things are at the same value - self.value[...] = uvals[0] - elif len(uvals)==2: - #only *one* of the tied things has changed. it must be different to self.value - newval = uvals[uvals != self.value*1] - self.value[...] = newval + self._init_labelBuf() + if p2 is None: + idx = self._highest_parent_._raveled_index_for(p) + val = self._sync_val_group(idx) + if np.all(self.label_buf[idx]==0): + # None of p has been tied before. + tie_idx = self._expandTieParam(1) + print tie_idx + tie_id = self.label_buf.max()+1 + self.label_buf[tie_idx] = tie_id + else: + b = self.label_buf[idx] + ids = np.unique(b[b>0]) + tie_id, tie_idx = self._merge_tie_param(ids) + self._highest_parent_.param_array[tie_idx] = val + idx = self._highest_parent_._raveled_index_for(p) + self.label_buf[idx] = tie_id else: - #more than one of the tied things changed. panic. 
- raise ValueError, "something is wrong with the tieing" + pass + self._updateTieFlag() + + def _merge_tie_param(self, ids): + """Merge the tie parameters with ids in the list.""" + if len(ids)==1: + id_final_idx = self.buf_idx[self.label_buf[self.buf_idx]==ids[0]][0] + return ids[0],id_final_idx + id_final = ids[0] + ids_rm = ids[1:] + label_buf_param = self.label_buf[self.buf_idx] + idx_param = [np.where(label_buf_param==i)[0][0] for i in ids_rm] + self._removeTieParam(idx_param) + [np.put(self.label_buf, np.where(self.label_buf==i), id_final) for i in ids_rm] + id_final_idx = self.buf_idx[self.label_buf[self.buf_idx]==id_final][0] + return id_final, id_final_idx + + def _sync_val_group(self, idx): + self._highest_parent_.param_array[idx] = self._highest_parent_.param_array[idx].mean() + return self._highest_parent_.param_array[idx][0] + + def _expandTieParam(self, num): + """Expand the tie param with the number of *num* parameters""" + if self.tied_param is None: + new_buf = np.empty((num,)) + else: + new_buf = np.empty((self.tied_param.size+num,)) + new_buf[:self.tied_param.size] = self.tied_param.param_array.copy() + self.remove_parameter(self.tied_param) + self.tied_param = Param('tied',new_buf) + self.add_parameter(self.tied_param) + buf_idx_new = self._highest_parent_._raveled_index_for(self.tied_param) + self._expand_label_buf(self.buf_idx, buf_idx_new) + self.buf_idx = buf_idx_new + return self.buf_idx[-num:] - def mapping(self): - return self.value + def _removeTieParam(self, idx): + """idx within tied_param""" + new_buf = np.empty((self.tied_param.size-len(idx),)) + bool_list = np.ones((self.tied_param.size,),dtype=np.bool) + bool_list[idx] = False + new_buf[:] = self.tied_param.param_array[bool_list] + self.remove_parameter(self.tied_param) + self.tied_param = Param('tied',new_buf) + self.add_parameter(self.tied_param) + buf_idx_new = self._highest_parent_._raveled_index_for(self.tied_param) + self._shrink_label_buf(self.buf_idx, buf_idx_new, 
bool_list) + self.buf_idx = buf_idx_new + + def _expand_label_buf(self, idx_old, idx_new): + """Expand label buffer accordingly""" + if idx_old is None: + self.label_buf = np.zeros(self._highest_parent_.param_array.shape, dtype=np.int) + else: + bool_old = np.zeros((self.label_buf.size,),dtype=np.bool) + bool_old[idx_old] = True + bool_new = np.zeros((self._highest_parent_.param_array.size,),dtype=np.bool) + bool_new[idx_new] = True + label_buf_new = np.zeros(self._highest_parent_.param_array.shape, dtype=np.int) + label_buf_new[np.logical_not(bool_new)] = self.label_buf[np.logical_not(bool_old)] + label_buf_new[idx_new[:len(idx_old)]] = self.label_buf[idx_old] + self.label_buf = label_buf_new + + def _shrink_label_buf(self, idx_old, idx_new, bool_list): + bool_old = np.zeros((self.label_buf.size,),dtype=np.bool) + bool_old[idx_old] = True + bool_new = np.zeros((self._highest_parent_.param_array.size,),dtype=np.bool) + bool_new[idx_new] = True + label_buf_new = np.empty(self._highest_parent_.param_array.shape, dtype=np.int) + label_buf_new[np.logical_not(bool_new)] = self.label_buf[np.logical_not(bool_old)] + label_buf_new[idx_new] = self.label_buf[idx_old[bool_list]] + self.label_buf = label_buf_new + + def _check_change(self): + changed = False + if self.tied_param is not None: + for i in xrange(self.tied_param.size): + b0 = self.label_buf==self.label_buf[self.buf_idx[i]] + b = self._highest_parent_.param_array[b0]!=self.tied_param[i] + if b.sum()==0: + print 'XXX' + continue + elif b.sum()==1: + print '!!!' 
+ val = self._highest_parent_.param_array[b0][b][0] + self._highest_parent_.param_array[b0] = val + else: + print '@@@' + self._highest_parent_.param_array[b0] = self.tied_param[i] + changed = True + return changed + + def parameters_changed(self): + #ensure all out parameters have the correct value, as specified by our mapping + changed = self._check_change() + if changed: + self._highest_parent_._trigger_params_changed() + self.collate_gradient() def collate_gradient(self): - index = self._highest_parent_.constraints[self] - self.value.gradient = np.sum(self._highest_parent_.gradient[index]) + if self.tied_param is not None: + self.tied_param.gradient = 0. + [np.put(self.tied_param.gradient, i, self._highest_parent_.gradient[self.label_buf==self.label_buf[self.buf_idx[i]]].sum()) + for i in xrange(self.tied_param.size)] + + def propagate_val(self): + if self.tied_param is not None: + for i in xrange(self.tied_param.size): + self._highest_parent_.param_array[self.label_buf==self.label_buf[self.buf_idx[i]]] = self.tied_param[i] diff --git a/GPy/core/parameterization/transformations.py b/GPy/core/parameterization/transformations.py index 506d80cd..53d4301d 100644 --- a/GPy/core/parameterization/transformations.py +++ b/GPy/core/parameterization/transformations.py @@ -54,7 +54,7 @@ class Transformation(object): class Logexp(Transformation): domain = _POSITIVE def f(self, x): - return np.where(x>_lim_val, x, np.log(1. + np.exp(np.clip(x, -_lim_val, _lim_val)))) + epsilon + return np.where(x>_lim_val, x, np.log1p(np.exp(np.clip(x, -_lim_val, _lim_val)))) + epsilon #raises overflow warning: return np.where(x>_lim_val, x, np.log(1. + np.exp(x))) def finv(self, f): return np.where(f>_lim_val, f, np.log(np.exp(f+1e-20) - 1.)) @@ -195,6 +195,9 @@ class Logistic(Transformation): self.lower, self.upper = float(lower), float(upper) self.difference = self.upper - self.lower def f(self, x): + if (x<-300.).any(): + x = x.copy() + x[x<-300.] = -300. 
return self.lower + self.difference / (1. + np.exp(-x)) def finv(self, f): return np.log(np.clip(f - self.lower, 1e-10, np.inf) / np.clip(self.upper - f, 1e-10, np.inf)) diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py index 3730baed..5412a70d 100644 --- a/GPy/core/parameterization/variational.py +++ b/GPy/core/parameterization/variational.py @@ -34,36 +34,49 @@ class NormalPrior(VariationalPrior): variational_posterior.variance.gradient -= (1. - (1. / (variational_posterior.variance))) * 0.5 class SpikeAndSlabPrior(VariationalPrior): - def __init__(self, pi, variance = 1.0, name='SpikeAndSlabPrior', **kw): + def __init__(self, pi=None, learnPi=False, variance = 1.0, name='SpikeAndSlabPrior', **kw): super(VariationalPrior, self).__init__(name=name, **kw) - assert variance==1.0, "Not Implemented!" self.pi = Param('pi', pi, Logistic(1e-10,1.-1e-10)) self.variance = Param('variance',variance) - self.add_parameters(self.pi) - self.group_spike_prob = False + self.learnPi = learnPi + if learnPi: + self.add_parameters(self.pi) def KL_divergence(self, variational_posterior): mu = variational_posterior.mean S = variational_posterior.variance gamma = variational_posterior.binary_prob - var_mean = np.square(mu) - var_S = (S - np.log(S)) - var_gamma = (gamma*np.log(gamma/self.pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-self.pi))).sum() - return var_gamma+ 0.5 * (gamma* (var_mean + var_S -1)).sum() + if len(self.pi.shape)==2: + idx = np.unique(gamma._raveled_index()/gamma.shape[-1]) + pi = self.pi[idx] + else: + pi = self.pi + + var_mean = np.square(mu)/self.variance + var_S = (S/self.variance - np.log(S)) + var_gamma = (gamma*np.log(gamma/pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-pi))).sum() + return var_gamma+ (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2. 
def update_gradients_KL(self, variational_posterior): mu = variational_posterior.mean S = variational_posterior.variance gamma = variational_posterior.binary_prob - - if self.group_spike_prob: - gamma_grad = np.log((1-self.pi)/self.pi*gamma/(1.-gamma))+(np.square(mu)+S-np.log(S)-1.)/2. - gamma.gradient -= gamma_grad.mean(axis=0) + if len(self.pi.shape)==2: + idx = np.unique(gamma._raveled_index()/gamma.shape[-1]) + pi = self.pi[idx] else: - gamma.gradient -= np.log((1-self.pi)/self.pi*gamma/(1.-gamma))+(np.square(mu)+S-np.log(S)-1.)/2. - mu.gradient -= gamma*mu - S.gradient -= (1. - (1. / (S))) * gamma /2. - self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi)).sum(axis=0) + pi = self.pi + + gamma.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2. + mu.gradient -= gamma*mu/self.variance + S.gradient -= (1./self.variance - 1./S) * gamma /2. + if self.learnPi: + if len(self.pi)==1: + self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi)).sum() + elif len(self.pi.shape)==1: + self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi)).sum(axis=0) + else: + self.pi[idx].gradient = (gamma/self.pi[idx] - (1.-gamma)/(1.-self.pi[idx])) class VariationalPosterior(Parameterized): def __init__(self, means=None, variances=None, name='latent space', *a, **kw): @@ -81,7 +94,7 @@ class VariationalPosterior(Parameterized): def _raveled_index(self): index = np.empty(dtype=int, shape=0) size = 0 - for p in self._parameters_: + for p in self.parameters: index = np.hstack((index, p._raveled_index()+size)) size += p._realsize_ if hasattr(p, '_realsize_') else p.size return index @@ -96,17 +109,20 @@ class VariationalPosterior(Parameterized): dc = self.__dict__.copy() dc['mean'] = self.mean[s] dc['variance'] = self.variance[s] - dc['_parameters_'] = copy.copy(self._parameters_) + dc['parameters'] = copy.copy(self.parameters) n.__dict__.update(dc) - n._parameters_[dc['mean']._parent_index_] = dc['mean'] - 
n._parameters_[dc['variance']._parent_index_] = dc['variance'] + n.parameters[dc['mean']._parent_index_] = dc['mean'] + n.parameters[dc['variance']._parent_index_] = dc['variance'] + n._gradient_array_ = None + oversize = self.size - self.mean.size - self.variance.size + n.size = n.mean.size + n.variance.size + oversize n.ndim = n.mean.ndim n.shape = n.mean.shape n.num_data = n.mean.shape[0] n.input_dim = n.mean.shape[1] if n.ndim != 1 else 1 return n else: - return super(VariationalPrior, self).__getitem__(s) + return super(VariationalPosterior, self).__getitem__(s) class NormalPosterior(VariationalPosterior): ''' @@ -147,11 +163,14 @@ class SpikeAndSlabPosterior(VariationalPosterior): dc['mean'] = self.mean[s] dc['variance'] = self.variance[s] dc['binary_prob'] = self.binary_prob[s] - dc['_parameters_'] = copy.copy(self._parameters_) + dc['parameters'] = copy.copy(self.parameters) n.__dict__.update(dc) - n._parameters_[dc['mean']._parent_index_] = dc['mean'] - n._parameters_[dc['variance']._parent_index_] = dc['variance'] - n._parameters_[dc['binary_prob']._parent_index_] = dc['binary_prob'] + n.parameters[dc['mean']._parent_index_] = dc['mean'] + n.parameters[dc['variance']._parent_index_] = dc['variance'] + n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob'] + n._gradient_array_ = None + oversize = self.size - self.mean.size - self.variance.size + n.size = n.mean.size + n.variance.size + oversize n.ndim = n.mean.ndim n.shape = n.mean.shape n.num_data = n.mean.shape[0] @@ -160,7 +179,7 @@ class SpikeAndSlabPosterior(VariationalPosterior): else: return super(VariationalPrior, self).__getitem__(s) - def plot(self, *args): + def plot(self, *args, **kwargs): """ Plot latent space X in 1D: @@ -169,4 +188,4 @@ class SpikeAndSlabPosterior(VariationalPosterior): import sys assert "matplotlib" in sys.modules, "matplotlib package has not been imported." 
from ...plotting.matplot_dep import variational_plots - return variational_plots.plot_SpikeSlab(self,*args) + return variational_plots.plot_SpikeSlab(self,*args, **kwargs) diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index b01d39c0..358db125 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -8,6 +8,9 @@ from ..inference.latent_function_inference import var_dtc from .. import likelihoods from parameterization.variational import VariationalPosterior +import logging +logger = logging.getLogger("sparse gp") + class SparseGP(GP): """ A general purpose Sparse GP model @@ -31,7 +34,7 @@ class SparseGP(GP): """ - def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, name='sparse gp', Y_metadata=None): + def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, name='sparse gp', Y_metadata=None, normalizer=False): #pick a sensible inference method if inference_method is None: @@ -45,28 +48,36 @@ class SparseGP(GP): self.Z = Param('inducing inputs', Z) self.num_inducing = Z.shape[0] - GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata) - + GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer) + logger.info("Adding Z as parameter") self.add_parameter(self.Z, index=0) def has_uncertain_inputs(self): return isinstance(self.X, VariationalPosterior) def parameters_changed(self): - self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata) + self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata) self.likelihood.update_gradients(self.grad_dict['dL_dthetaL']) if isinstance(self.X, VariationalPosterior): #gradients wrt kernel - dL_dKmm = 
self.grad_dict.pop('dL_dKmm') + dL_dKmm = self.grad_dict['dL_dKmm'] self.kern.update_gradients_full(dL_dKmm, self.Z, None) target = self.kern.gradient.copy() - self.kern.update_gradients_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.grad_dict['dL_dpsi0'], dL_dpsi1=self.grad_dict['dL_dpsi1'], dL_dpsi2=self.grad_dict['dL_dpsi2']) + self.kern.update_gradients_expectations(variational_posterior=self.X, + Z=self.Z, + dL_dpsi0=self.grad_dict['dL_dpsi0'], + dL_dpsi1=self.grad_dict['dL_dpsi1'], + dL_dpsi2=self.grad_dict['dL_dpsi2']) self.kern.gradient += target #gradients wrt Z self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z) self.Z.gradient += self.kern.gradients_Z_expectations( - self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X) + self.grad_dict['dL_dpsi0'], + self.grad_dict['dL_dpsi1'], + self.grad_dict['dL_dpsi2'], + Z=self.Z, + variational_posterior=self.X) else: #gradients wrt kernel self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X) @@ -107,5 +118,3 @@ class SparseGP(GP): psi2 = kern.psi2(self.Z, Xnew) var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1) return mu, var - - diff --git a/GPy/core/sparse_gp_mpi.py b/GPy/core/sparse_gp_mpi.py new file mode 100644 index 00000000..7910cb71 --- /dev/null +++ b/GPy/core/sparse_gp_mpi.py @@ -0,0 +1,115 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +from sparse_gp import SparseGP +from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch + +import logging +logger = logging.getLogger("sparse gp mpi") + +class SparseGP_MPI(SparseGP): + """ + A general purpose Sparse GP model with MPI parallelization support + + This model allows (approximate) inference using variational DTC or FITC + (Gaussian likelihoods) as well as non-conjugate sparse methods based on + these. 
+ + :param X: inputs + :type X: np.ndarray (num_data x input_dim) + :param likelihood: a likelihood instance, containing the observed data + :type likelihood: GPy.likelihood.(Gaussian | EP | Laplace) + :param kernel: the kernel (covariance function). See link kernels + :type kernel: a GPy.kern.kern instance + :param X_variance: The uncertainty in the measurements of X (Gaussian variance) + :type X_variance: np.ndarray (num_data x input_dim) | None + :param Z: inducing inputs + :type Z: np.ndarray (num_inducing x input_dim) + :param num_inducing: Number of inducing points (optional, default 10. Ignored if Z is not None) + :type num_inducing: int + :param mpi_comm: The communication group of MPI, e.g. mpi4py.MPI.COMM_WORLD + :type mpi_comm: mpi4py.MPI.Intracomm + + """ + + def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False): + self._IN_OPTIMIZATION_ = False + if mpi_comm != None: + if inference_method is None: + inference_method = VarDTC_minibatch(mpi_comm=mpi_comm) + else: + assert isinstance(inference_method, VarDTC_minibatch), 'inference_method has to support MPI!' 
+ + super(SparseGP_MPI, self).__init__(X, Y, Z, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer) + self.updates = False + self.add_parameter(self.X, index=0) + if variational_prior is not None: + self.add_parameter(variational_prior) + self.X.fix() + + self.mpi_comm = mpi_comm + # Manage the data (Y) division + if mpi_comm != None: + from ..util.mpi import divide_data + N_start, N_end, N_list = divide_data(Y.shape[0], mpi_comm) + self.N_range = (N_start, N_end) + self.N_list = np.array(N_list) + self.Y_local = self.Y[N_start:N_end] + print 'MPI RANK '+str(self.mpi_comm.rank)+' with the data range '+str(self.N_range) + mpi_comm.Bcast(self.param_array, root=0) + self.updates = True + + def __getstate__(self): + dc = super(SparseGP_MPI, self).__getstate__() + dc['mpi_comm'] = None + if self.mpi_comm != None: + del dc['N_range'] + del dc['N_list'] + del dc['Y_local'] + return dc + + #===================================================== + # The MPI parallelization + # - can move to model at some point + #===================================================== + + @SparseGP.optimizer_array.setter + def optimizer_array(self, p): + if self.mpi_comm != None: + if self._IN_OPTIMIZATION_ and self.mpi_comm.rank==0: + self.mpi_comm.Bcast(np.int32(1),root=0) + self.mpi_comm.Bcast(p, root=0) + + from ..util.debug import checkFinite + checkFinite(p, 'optimizer_array') + + SparseGP.optimizer_array.fset(self,p) + + def optimize(self, optimizer=None, start=None, **kwargs): + self._IN_OPTIMIZATION_ = True + if self.mpi_comm==None: + super(SparseGP_MPI, self).optimize(optimizer,start,**kwargs) + elif self.mpi_comm.rank==0: + super(SparseGP_MPI, self).optimize(optimizer,start,**kwargs) + self.mpi_comm.Bcast(np.int32(-1),root=0) + elif self.mpi_comm.rank>0: + x = self.optimizer_array.copy() + flag = np.empty(1,dtype=np.int32) + while True: + self.mpi_comm.Bcast(flag,root=0) + if flag==1: + self.optimizer_array = x + elif 
flag==-1: + break + else: + self._IN_OPTIMIZATION_ = False + raise Exception("Unrecognizable flag for synchronization!") + self._IN_OPTIMIZATION_ = False + + def parameters_changed(self): + if isinstance(self.inference_method,VarDTC_minibatch): + update_gradients(self, mpi_comm=self.mpi_comm) + else: + super(SparseGP_MPI,self).parameters_changed() + diff --git a/GPy/gpy_config.cfg b/GPy/defaults.cfg similarity index 51% rename from GPy/gpy_config.cfg rename to GPy/defaults.cfg index db90dbf6..50cc1107 100644 --- a/GPy/gpy_config.cfg +++ b/GPy/defaults.cfg @@ -1,11 +1,20 @@ -# This is the configuration file for GPy +# This is the default configuration file for GPy +# Do note edit this file. + +# For machine specific changes (i.e. those specific to a given installation) edit GPy/installation.cfg + +# For user specific changes edit $HOME/.gpy_user.cfg [parallel] # Enable openmp support. This speeds up some computations, depending on the number # of cores available. Setting up a compiler with openmp support can be difficult on -# some platforms, hence this option. +# some platforms, hence by default it is off. openmp=False +[datasets] +# location for the local data cache +dir=$HOME/tmp/GPy-datasets/ + [anaconda] # if you have an anaconda python installation please specify it here. 
installed = False diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py index 2b0a201d..ae9d8eb8 100644 --- a/GPy/examples/classification.py +++ b/GPy/examples/classification.py @@ -96,15 +96,11 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot= # Optimize if optimize: - #m.update_likelihood_approximation() - # Parameters optimization: try: m.optimize('scg', messages=1) except Exception as e: return m - #m.pseudo_EM() - # Plot if plot: fig, axes = pb.subplots(2, 1) @@ -133,10 +129,7 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti # Optimize if optimize: - #m.update_likelihood_approximation() - # Parameters optimization: - #m.optimize() - m.pseudo_EM() + m.optimize() # Plot if plot: diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index 8a31968e..842d0bf8 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -37,7 +37,7 @@ def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan # k = GPy.kern.RBF(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True) p = .3 - + m = GPy.models.BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing) if nan: @@ -99,7 +99,7 @@ def sparse_gplvm_oil(optimize=True, verbose=0, plot=True, N=100, Q=6, num_induci m.kern.plot_ARD() return m -def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=15, Q=4, sigma=.2): +def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=25, Q=4, sigma=.2): import GPy from GPy.util.datasets import swiss_roll_generated from GPy.models import BayesianGPLVM @@ -144,16 +144,15 @@ def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=15, Q=4 m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel) m.data_colors = c m.data_t = t - m['noise_variance'] = 
Y.var() / 100. if optimize: - m.optimize('scg', messages=verbose, max_iters=2e3) + m.optimize('bfgs', messages=verbose, max_iters=2e3) if plot: fig = plt.figure('fitted') ax = fig.add_subplot(111) s = m.input_sensitivity().argsort()[::-1][:2] - ax.scatter(*m.X.T[s], c=c) + ax.scatter(*m.X.mean.T[s], c=c) return m @@ -161,6 +160,7 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, import GPy from matplotlib import pyplot as plt from ..util.misc import param_to_array + import numpy as np _np.random.seed(0) data = GPy.util.datasets.oil() @@ -169,24 +169,50 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, Y = data['X'][:N] m = GPy.models.BayesianGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing, **k) m.data_labels = data['Y'][:N].argmax(axis=1) - + if optimize: - m.optimize('scg', messages=verbose, max_iters=max_iters, gtol=.05) + m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05) if plot: - y = m.Y fig, (latent_axes, sense_axes) = plt.subplots(1, 2) m.plot_latent(ax=latent_axes, labels=m.data_labels) - data_show = GPy.plotting.matplot_dep.visualize.vector_show(y) - lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(param_to_array(m.X.mean), # @UnusedVariable - m, data_show, latent_axes=latent_axes, sense_axes=sense_axes) + data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0,:])) + lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(param_to_array(m.X.mean)[0:1,:], # @UnusedVariable + m, data_show, latent_axes=latent_axes, sense_axes=sense_axes, labels=m.data_labels) + raw_input('Press enter to finish') + plt.close(fig) + return m + +def ssgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k): + import GPy + from matplotlib import pyplot as plt + from ..util.misc import param_to_array + import numpy as np + + _np.random.seed(0) + data = GPy.util.datasets.oil() + + kernel = GPy.kern.RBF(Q, 1., 
1./_np.random.uniform(0,1,(Q,)), ARD=True)# + GPy.kern.Bias(Q, _np.exp(-2)) + Y = data['X'][:N] + m = GPy.models.SSGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing, **k) + m.data_labels = data['Y'][:N].argmax(axis=1) + + if optimize: + m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05) + + if plot: + fig, (latent_axes, sense_axes) = plt.subplots(1, 2) + m.plot_latent(ax=latent_axes, labels=m.data_labels) + data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0,:])) + lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(param_to_array(m.X.mean)[0:1,:], # @UnusedVariable + m, data_show, latent_axes=latent_axes, sense_axes=sense_axes, labels=m.data_labels) raw_input('Press enter to finish') plt.close(fig) return m def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False): _np.random.seed(1234) - + x = _np.linspace(0, 4 * _np.pi, N)[:, None] s1 = _np.vectorize(lambda x: _np.sin(x)) s2 = _np.vectorize(lambda x: _np.cos(x)**2) @@ -289,6 +315,31 @@ def bgplvm_simulation(optimize=True, verbose=1, m.kern.plot_ARD('BGPLVM Simulation ARD Parameters') return m +def ssgplvm_simulation(optimize=True, verbose=1, + plot=True, plot_sim=False, + max_iters=2e4, useGPU=False + ): + from GPy import kern + from GPy.models import SSGPLVM + + D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 45, 3, 9 + _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) + Y = Ylist[0] + k = kern.Linear(Q, ARD=True, useGPU=useGPU)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) + #k = kern.RBF(Q, ARD=True, lengthscale=10.) 
+ m = SSGPLVM(Y, Q, init="pca", num_inducing=num_inducing, kernel=k) + m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape) + m.likelihood.variance = .1 + + if optimize: + print "Optimizing model:" + m.optimize('scg', messages=verbose, max_iters=max_iters, + gtol=.05) + if plot: + m.X.plot("SSGPLVM Latent Space 1D") + m.kern.plot_ARD('SSGPLVM Simulation ARD Parameters') + return m + def bgplvm_simulation_missing_data(optimize=True, verbose=1, plot=True, plot_sim=False, max_iters=2e4, @@ -297,15 +348,18 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1, from GPy.models import BayesianGPLVM from GPy.inference.latent_function_inference.var_dtc import VarDTCMissingData - D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 45, 7, 9 + D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4 _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) Y = Ylist[0] k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) - inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool) - m = BayesianGPLVM(Y.copy(), Q, init="random", num_inducing=num_inducing, kernel=k) - m.inference_method = VarDTCMissingData() - m.Y[inan] = _np.nan + inan = _np.random.binomial(1, .8, size=Y.shape).astype(bool) # 80% missing data + Ymissing = Y.copy() + Ymissing[inan] = _np.nan + + m = BayesianGPLVM(Ymissing, Q, init="random", num_inducing=num_inducing, + inference_method=VarDTCMissingData(inan=inan), kernel=k) + m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape) m.likelihood.variance = .01 m.parameters_changed() @@ -338,7 +392,40 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw): print "Optimizing Model:" m.optimize(messages=verbose, max_iters=8e3, gtol=.1) if plot: - m.plot_X_1d("MRD Latent Space 1D") + m.X.plot("MRD Latent Space 1D") + m.plot_scales("MRD Scales") + return m + +def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim=True, **kw): + from GPy import kern + from GPy.models import MRD + 
from GPy.inference.latent_function_inference.var_dtc import VarDTCMissingData + + D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5 + _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) + + #Ylist = [Ylist[0]] + k = kern.Linear(Q, ARD=True) + inanlist = [] + + for Y in Ylist: + inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool) + inanlist.append(inan) + Y[inan] = _np.nan + + imlist = [] + for inan in inanlist: + imlist.append(VarDTCMissingData(limit=1, inan=inan)) + + m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing, + kernel=k, inference_method=imlist, + initx="random", initz='permute', **kw) + + if optimize: + print "Optimizing Model:" + m.optimize('bfgs', messages=verbose, max_iters=8e3, gtol=.1) + if plot: + m.X.plot("MRD Latent Space 1D") m.plot_scales("MRD Scales") return m @@ -351,18 +438,17 @@ def brendan_faces(optimize=True, verbose=True, plot=True): Yn = Y - Y.mean() Yn /= Yn.std() - m = GPy.models.GPLVM(Yn, Q) + m = GPy.models.BayesianGPLVM(Yn, Q, num_inducing=20) # optimize - m.constrain('rbf|noise|white', GPy.transformations.LogexpClipped()) - if optimize: m.optimize('scg', messages=verbose, max_iters=1000) + if optimize: m.optimize('bfgs', messages=verbose, max_iters=1000) if plot: ax = m.plot_latent(which_indices=(0, 1)) - y = m.likelihood.Y[0, :] + y = m.Y[0, :] data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False) - GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + lvm = GPy.plotting.matplot_dep.visualize.lvm(m.X.mean[0, :].copy(), m, data_show, ax) raw_input('Press enter to finish') return m @@ -376,13 +462,14 @@ def olivetti_faces(optimize=True, verbose=True, plot=True): Yn = Y - Y.mean() Yn /= Yn.std() - m = GPy.models.GPLVM(Yn, Q) - if optimize: m.optimize('scg', messages=verbose, max_iters=1000) + m = GPy.models.BayesianGPLVM(Yn, Q, num_inducing=20) + + if optimize: m.optimize('bfgs', 
messages=verbose, max_iters=1000) if plot: ax = m.plot_latent(which_indices=(0, 1)) - y = m.likelihood.Y[0, :] + y = m.Y[0, :] data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False) - GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + lvm = GPy.plotting.matplot_dep.visualize.lvm(m.X.mean[0, :].copy(), m, data_show, ax) raw_input('Press enter to finish') return m @@ -408,15 +495,16 @@ def stick(kernel=None, optimize=True, verbose=True, plot=True): data = GPy.util.datasets.osu_run1() # optimize m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel) - if optimize: m.optimize(messages=verbose, max_f_eval=10000) + if optimize: m.optimize('bfgs', messages=verbose, max_f_eval=10000) if plot: plt.clf ax = m.plot_latent() y = m.Y[0, :] data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect']) - vis = GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, latent_axes=ax) + lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[:1, :].copy(), m, data_show, latent_axes=ax) raw_input('Press enter to finish') - + lvm_visualizer.close() + data_show.close() return m def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True): @@ -464,9 +552,8 @@ def robot_wireless(optimize=True, verbose=True, plot=True): data = GPy.util.datasets.robot_wireless() # optimize - m = GPy.models.GPLVM(data['Y'], 2) + m = GPy.models.BayesianGPLVM(data['Y'], 4, num_inducing=25) if optimize: m.optimize(messages=verbose, max_f_eval=10000) - m._set_params(m._get_params()) if plot: m.plot_latent() @@ -475,23 +562,32 @@ def robot_wireless(optimize=True, verbose=True, plot=True): def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True): from GPy.models import BayesianGPLVM from matplotlib import pyplot as plt + import numpy as np import GPy data = GPy.util.datasets.osu_run1() Q = 6 - kernel = GPy.kern.RBF(Q, ARD=True) + 
GPy.kern.Bias(Q, _np.exp(-2)) + GPy.kern.White(Q, _np.exp(-2)) + kernel = GPy.kern.RBF(Q, lengthscale=np.repeat(.5, Q), ARD=True) m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel) + + m.data = data + m.likelihood.variance = 0.001 + # optimize - m.ensure_default_constraints() - if optimize: m.optimize('scg', messages=verbose, max_iters=200, xtol=1e-300, ftol=1e-300) - m._set_params(m._get_params()) + try: + if optimize: m.optimize('bfgs', messages=verbose, max_iters=5e3, bfgs_factor=10) + except KeyboardInterrupt: + print "Keyboard interrupt, continuing to plot and return" + if plot: - plt.clf, (latent_axes, sense_axes) = plt.subplots(1, 2) + fig, (latent_axes, sense_axes) = plt.subplots(1, 2) plt.sca(latent_axes) - m.plot_latent() - y = m.likelihood.Y[0, :].copy() - data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect']) - GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes) + m.plot_latent(ax=latent_axes) + y = m.Y[:1, :].copy() + data_show = GPy.plotting.matplot_dep.visualize.stick_show(y, connect=data['connect']) + dim_select = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean[:1, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes) + fig.canvas.draw() + fig.canvas.show() raw_input('Press enter to finish') return m @@ -509,11 +605,12 @@ def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose if optimize: m.optimize(messages=verbose, max_f_eval=10000) if plot: ax = m.plot_latent() - y = m.likelihood.Y[0, :] + y = m.Y[0, :] data_show = GPy.plotting.matplot_dep.visualize.skeleton_show(y[None, :], data['skel']) - lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[0].copy(), m, data_show, latent_axes=ax) raw_input('Press enter to finish') lvm_visualizer.close() + 
data_show.close() return m @@ -522,7 +619,7 @@ def ssgplvm_simulation_linear(): import GPy N, D, Q = 1000, 20, 5 pi = 0.2 - + def sample_X(Q, pi): x = np.empty(Q) dies = np.random.rand(Q) @@ -532,7 +629,7 @@ def ssgplvm_simulation_linear(): else: x[q] = 0. return x - + Y = np.empty((N,D)) X = np.empty((N,Q)) # Generate data from random sampled weight matrices @@ -540,4 +637,4 @@ def ssgplvm_simulation_linear(): X[n] = sample_X(Q,pi) w = np.random.randn(D,Q) Y[n] = np.dot(w,X[n]) - + diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py index 2a4b91b3..c4465061 100644 --- a/GPy/examples/regression.py +++ b/GPy/examples/regression.py @@ -387,7 +387,7 @@ def silhouette(max_iters=100, optimize=True, plot=True): print m return m -def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, optimize=True, plot=True): +def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, optimize=True, plot=True, checkgrad=True): """Run a 1D example of a sparse GP regression.""" # sample inputs and outputs X = np.random.uniform(-3., 3., (num_samples, 1)) @@ -396,7 +396,9 @@ def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, opti rbf = GPy.kern.RBF(1) # create simple GP Model m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing) - m.checkgrad(verbose=1) + + if checkgrad: + m.checkgrad(verbose=1) if optimize: m.optimize('tnc', messages=1, max_iters=max_iters) diff --git a/GPy/inference/latent_function_inference/__init__.py b/GPy/inference/latent_function_inference/__init__.py index 68004a08..8589a60e 100644 --- a/GPy/inference/latent_function_inference/__init__.py +++ b/GPy/inference/latent_function_inference/__init__.py @@ -25,6 +25,39 @@ etc. """ +class LatentFunctionInference(object): + def on_optimization_start(self): + """ + This function gets called, just before the optimization loop to start. 
+ """ + pass + + def on_optimization_end(self): + """ + This function gets called, just after the optimization loop ended. + """ + pass + +class InferenceMethodList(LatentFunctionInference, list): + + def on_optimization_start(self): + for inf in self: + inf.on_optimization_start() + + def on_optimization_end(self): + for inf in self: + inf.on_optimization_end() + + def __getstate__(self): + state = [] + for inf in self: + state.append(inf) + return state + + def __setstate__(self, state): + for inf in state: + self.append(inf) + from exact_gaussian_inference import ExactGaussianInference from laplace import Laplace from GPy.inference.latent_function_inference.var_dtc import VarDTC @@ -38,11 +71,26 @@ from var_dtc_gpu import VarDTC_GPU # class FullLatentFunctionData(object): # # -# class LatentFunctionInference(object): -# def inference(self, kern, X, likelihood, Y, Y_metadata=None): + +# class EMLikeLatentFunctionInference(LatentFunctionInference): +# def update_approximation(self): +# """ +# This function gets called when the +# """ +# +# def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None): # """ # Do inference on the latent functions given a covariance function `kern`, -# inputs and outputs `X` and `Y`, and a likelihood `likelihood`. +# inputs and outputs `X` and `Y`, inducing_inputs `Z`, and a likelihood `likelihood`. +# Additional metadata for the outputs `Y` can be given in `Y_metadata`. +# """ +# raise NotImplementedError, "Abstract base class for full inference" +# +# class VariationalLatentFunctionInference(LatentFunctionInference): +# def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None): +# """ +# Do inference on the latent functions given a covariance function `kern`, +# inputs and outputs `X` and `Y`, inducing_inputs `Z`, and a likelihood `likelihood`. # Additional metadata for the outputs `Y` can be given in `Y_metadata`. 
# """ # raise NotImplementedError, "Abstract base class for full inference" diff --git a/GPy/inference/latent_function_inference/dtc.py b/GPy/inference/latent_function_inference/dtc.py index 1a84da6b..1b6b1dbd 100644 --- a/GPy/inference/latent_function_inference/dtc.py +++ b/GPy/inference/latent_function_inference/dtc.py @@ -4,9 +4,10 @@ from posterior import Posterior from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv import numpy as np +from . import LatentFunctionInference log_2_pi = np.log(2*np.pi) -class DTC(object): +class DTC(LatentFunctionInference): """ An object for inference when the likelihood is Gaussian, but we want to do sparse inference. diff --git a/GPy/inference/latent_function_inference/exact_gaussian_inference.py b/GPy/inference/latent_function_inference/exact_gaussian_inference.py index c0177e9f..0c02efe3 100644 --- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py +++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py @@ -5,10 +5,11 @@ from posterior import Posterior from ...util.linalg import pdinv, dpotrs, tdot from ...util import diag import numpy as np +from . import LatentFunctionInference log_2_pi = np.log(2*np.pi) -class ExactGaussianInference(object): +class ExactGaussianInference(LatentFunctionInference): """ An object for inference when the likelihood is Gaussian. diff --git a/GPy/inference/latent_function_inference/expectation_propagation.py b/GPy/inference/latent_function_inference/expectation_propagation.py index 172f43fb..1afc8100 100644 --- a/GPy/inference/latent_function_inference/expectation_propagation.py +++ b/GPy/inference/latent_function_inference/expectation_propagation.py @@ -1,9 +1,10 @@ import numpy as np from ...util.linalg import pdinv,jitchol,DSYR,tdot,dtrtrs, dpotrs from posterior import Posterior +from . 
import LatentFunctionInference log_2_pi = np.log(2*np.pi) -class EP(object): +class EP(LatentFunctionInference): def __init__(self, epsilon=1e-6, eta=1., delta=1.): """ The expectation-propagation algorithm. @@ -21,14 +22,25 @@ class EP(object): def reset(self): self.old_mutilde, self.old_vtilde = None, None + self._ep_approximation = None + + def on_optimization_start(self): + self._ep_approximation = None + + def on_optimization_end(self): + # TODO: update approximation in the end as well? Maybe even with a switch? + pass def inference(self, kern, X, likelihood, Y, Y_metadata=None, Z=None): - num_data, output_dim = X.shape + num_data, output_dim = Y.shape assert output_dim ==1, "ep in 1D only (for now!)" K = kern.K(X) - mu, Sigma, mu_tilde, tau_tilde, Z_hat = self.expectation_propagation(K, Y, likelihood, Y_metadata) + if self._ep_approximation is None: + mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata) + else: + mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation Wi, LW, LWi, W_logdet = pdinv(K + np.diag(1./tau_tilde)) @@ -42,8 +54,6 @@ class EP(object): return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL} - - def expectation_propagation(self, K, Y, likelihood, Y_metadata): num_data, data_dim = Y.shape @@ -108,4 +118,3 @@ class EP(object): mu_tilde = v_tilde/tau_tilde return mu, Sigma, mu_tilde, tau_tilde, Z_hat - diff --git a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py index 3625a5bf..3aeb4fbb 100644 --- a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py +++ b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py @@ -1,39 +1,192 @@ import numpy as np -from ...util.linalg import pdinv,jitchol,DSYR,tdot,dtrtrs, dpotrs -from expectation_propagation import EP +from ...util import 
diag +from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify, DSYR +from ...util.misc import param_to_array +from ...core.parameterization.variational import VariationalPosterior +from . import LatentFunctionInference from posterior import Posterior log_2_pi = np.log(2*np.pi) -class EPDTC(EP): - #def __init__(self, epsilon=1e-6, eta=1., delta=1.): +class EPDTC(LatentFunctionInference): + const_jitter = 1e-6 + def __init__(self, epsilon=1e-6, eta=1., delta=1., limit=1): + from ...util.caching import Cacher + self.limit = limit + self.get_trYYT = Cacher(self._get_trYYT, limit) + self.get_YYTfactor = Cacher(self._get_YYTfactor, limit) + + self.epsilon, self.eta, self.delta = epsilon, eta, delta + self.reset() + + def set_limit(self, limit): + self.get_trYYT.limit = limit + self.get_YYTfactor.limit = limit + + def _get_trYYT(self, Y): + return param_to_array(np.sum(np.square(Y))) + + def __getstate__(self): + # has to be overridden, as Cacher objects cannot be pickled. + return self.limit + + def __setstate__(self, state): + # has to be overridden, as Cacher objects cannot be pickled. + self.limit = state + from ...util.caching import Cacher + self.get_trYYT = Cacher(self._get_trYYT, self.limit) + self.get_YYTfactor = Cacher(self._get_YYTfactor, self.limit) + + def _get_YYTfactor(self, Y): + """ + find a matrix L which satisfies LLT = YYT. + + Note that L may have fewer columns than Y. 
+ """ + N, D = Y.shape + if (N>=D): + return param_to_array(Y) + else: + return jitchol(tdot(Y)) + + def get_VVTfactor(self, Y, prec): + return Y * prec # TODO chache this, and make it effective + + def reset(self): + self.old_mutilde, self.old_vtilde = None, None + self._ep_approximation = None def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None): - num_data, output_dim = X.shape + num_data, output_dim = Y.shape assert output_dim ==1, "ep in 1D only (for now!)" Kmm = kern.K(Z) Kmn = kern.K(Z,X) + if self._ep_approximation is None: + mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata) + else: + mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation + + + if isinstance(X, VariationalPosterior): + uncertain_inputs = True + psi0 = kern.psi0(Z, X) + psi1 = Kmn.T#kern.psi1(Z, X) + psi2 = kern.psi2(Z, X) + else: + uncertain_inputs = False + psi0 = kern.Kdiag(X) + psi1 = Kmn.T#kern.K(X, Z) + psi2 = None + + #see whether we're using variational uncertain inputs + + _, output_dim = Y.shape + + #see whether we've got a different noise variance for each datum + #beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6) + beta = tau_tilde + VVT_factor = beta[:,None]*mu_tilde[:,None] + trYYT = self.get_trYYT(mu_tilde[:,None]) + + # do the inference: + het_noise = beta.size > 1 + num_inducing = Z.shape[0] + num_data = Y.shape[0] + # kernel computations, using BGPLVM notation + + Kmm = kern.K(Z).copy() + diag.add(Kmm, self.const_jitter) Lm = jitchol(Kmm) - Lmi = dtrtrs(Lm,np.eye(Lm.shape[0]))[0] - Kmmi = np.dot(Lmi.T,Lmi) - KmmiKmn = np.dot(Kmmi,Kmn) - K = np.dot(Kmn.T,KmmiKmn) + + # The rather complex computations of A + if uncertain_inputs: + if het_noise: + psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0) + else: + psi2_beta = psi2.sum(0) * beta + LmInv = dtrtri(Lm) + A = LmInv.dot(psi2_beta.dot(LmInv.T)) + else: + if het_noise: + tmp = psi1 * 
(np.sqrt(beta.reshape(num_data, 1))) + else: + tmp = psi1 * (np.sqrt(beta)) + tmp, _ = dtrtrs(Lm, tmp.T, lower=1) + A = tdot(tmp) #print A.sum() + + # factor B + B = np.eye(num_inducing) + A + LB = jitchol(B) + psi1Vf = np.dot(psi1.T, VVT_factor) + # back substutue C into psi1Vf + tmp, _ = dtrtrs(Lm, psi1Vf, lower=1, trans=0) + _LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0) + tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1) + Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1) + + # data fit and derivative of L w.r.t. Kmm + delit = tdot(_LBi_Lmi_psi1Vf) + data_fit = np.trace(delit) + DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit) + delit = -0.5 * DBi_plus_BiPBi + delit += -0.5 * B * output_dim + delit += output_dim * np.eye(num_inducing) + # Compute dL_dKmm + dL_dKmm = backsub_both_sides(Lm, delit) + + # derivatives of L w.r.t. psi + dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, + VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, + psi1, het_noise, uncertain_inputs) + + # log marginal likelihood + log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, + psi0, A, LB, trYYT, data_fit, VVT_factor) + + #put the gradients in the right places + dL_dR = _compute_dL_dR(likelihood, + het_noise, uncertain_inputs, LB, + _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, + psi0, psi1, beta, + data_fit, num_data, output_dim, trYYT, mu_tilde[:,None]) + + dL_dthetaL = 0#likelihood.exact_inference_gradients(dL_dR,Y_metadata) + + if uncertain_inputs: + grad_dict = {'dL_dKmm': dL_dKmm, + 'dL_dpsi0':dL_dpsi0, + 'dL_dpsi1':dL_dpsi1, + 'dL_dpsi2':dL_dpsi2, + 'dL_dthetaL':dL_dthetaL} + else: + grad_dict = {'dL_dKmm': dL_dKmm, + 'dL_dKdiag':dL_dpsi0, + 'dL_dKnm':dL_dpsi1, + 'dL_dthetaL':dL_dthetaL} + + #get sufficient things for posterior prediction + #TODO: do we really want to do this in the loop? 
+ if VVT_factor.shape[1] == Y.shape[1]: + woodbury_vector = Cpsi1Vf # == Cpsi1V + else: + print 'foobar' + psi1V = np.dot(mu_tilde[:,None].T*beta, psi1).T + tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0) + tmp, _ = dpotrs(LB, tmp, lower=1) + woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1) + Bi, _ = dpotri(LB, lower=1) + symmetrify(Bi) + Bi = -dpotri(LB, lower=1)[0] + diag.add(Bi, 1) + + woodbury_inv = backsub_both_sides(Lm, Bi) + + #construct a posterior object + post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm) + return post, log_marginal, grad_dict - mu, Sigma, mu_tilde, tau_tilde, Z_hat = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata) - - Wi, LW, LWi, W_logdet = pdinv(K + np.diag(1./tau_tilde)) - - alpha, _ = dpotrs(LW, mu_tilde, lower=1) - - log_marginal = 0.5*(-num_data * log_2_pi - W_logdet - np.sum(alpha * mu_tilde)) # TODO: add log Z_hat?? - - dL_dK = 0.5 * (tdot(alpha[:,None]) - Wi) - - dL_dthetaL = np.zeros(likelihood.size)#TODO: derivatives of the likelihood parameters - - return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL} @@ -121,3 +274,69 @@ class EPDTC(EP): mu_tilde = v_tilde/tau_tilde return mu, Sigma, mu_tilde, tau_tilde, Z_hat + +def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, psi1, het_noise, uncertain_inputs): + dL_dpsi0 = -0.5 * output_dim * (beta[:,None] * np.ones([num_data, 1])).flatten() + dL_dpsi1 = np.dot(VVT_factor, Cpsi1Vf.T) + dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi) + if het_noise: + if uncertain_inputs: + dL_dpsi2 = beta[:, None, None] * dL_dpsi2_beta[None, :, :] + else: + dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * beta.reshape(num_data, 1)).T).T + dL_dpsi2 = None + else: + dL_dpsi2 = beta * dL_dpsi2_beta + if uncertain_inputs: + # repeat for each of the N psi_2 
matrices + dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0) + else: + # subsume back into psi1 (==Kmn) + dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2) + dL_dpsi2 = None + + return dL_dpsi0, dL_dpsi1, dL_dpsi2 + + +def _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data, output_dim, trYYT, Y): + # the partial derivative vector for the likelihood + if likelihood.size == 0: + # save computation here. + dL_dR = None + elif het_noise: + if uncertain_inputs: + raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented" + else: + #from ...util.linalg import chol_inv + #LBi = chol_inv(LB) + LBi, _ = dtrtrs(LB,np.eye(LB.shape[0])) + + Lmi_psi1, nil = dtrtrs(Lm, psi1.T, lower=1, trans=0) + _LBi_Lmi_psi1, _ = dtrtrs(LB, Lmi_psi1, lower=1, trans=0) + + dL_dR = -0.5 * beta + 0.5 * (beta*Y)**2 + dL_dR += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2 + + dL_dR += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2 + + dL_dR += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * Y * beta**2 + dL_dR += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2 + else: + # likelihood is not heteroscedatic + dL_dR = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2 + dL_dR += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta) + dL_dR += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit) + return dL_dR + +def _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, psi0, A, LB, trYYT, data_fit,Y): + #compute log marginal likelihood + if het_noise: + lik_1 = -0.5 * num_data * output_dim * np.log(2. * np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(beta * np.square(Y).sum(axis=-1)) + lik_2 = -0.5 * output_dim * (np.sum(beta.flatten() * psi0) - np.trace(A)) + else: + lik_1 = -0.5 * num_data * output_dim * (np.log(2. 
* np.pi) - np.log(beta)) - 0.5 * beta * trYYT + lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A)) + lik_3 = -output_dim * (np.sum(np.log(np.diag(LB)))) + lik_4 = 0.5 * data_fit + log_marginal = lik_1 + lik_2 + lik_3 + lik_4 + return log_marginal diff --git a/GPy/inference/latent_function_inference/fitc.py b/GPy/inference/latent_function_inference/fitc.py index de47e5d5..a184c6c4 100644 --- a/GPy/inference/latent_function_inference/fitc.py +++ b/GPy/inference/latent_function_inference/fitc.py @@ -5,9 +5,10 @@ from posterior import Posterior from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv from ...util import diag import numpy as np +from . import LatentFunctionInference log_2_pi = np.log(2*np.pi) -class FITC(object): +class FITC(LatentFunctionInference): """ An object for inference when the likelihood is Gaussian, but we want to do sparse inference. diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py index 9ba3f83f..1c153518 100644 --- a/GPy/inference/latent_function_inference/laplace.py +++ b/GPy/inference/latent_function_inference/laplace.py @@ -16,8 +16,9 @@ from ...util.misc import param_to_array from posterior import Posterior import warnings from scipy import optimize +from . 
import LatentFunctionInference -class Laplace(object): +class Laplace(LatentFunctionInference): def __init__(self): """ diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py index 309cb7e5..66c68261 100644 --- a/GPy/inference/latent_function_inference/posterior.py +++ b/GPy/inference/latent_function_inference/posterior.py @@ -95,7 +95,7 @@ class Posterior(object): """ if self._covariance is None: #LiK, _ = dtrtrs(self.woodbury_chol, self._K, lower=1) - self._covariance = self._K - (np.tensordot(np.dot(np.atleast_3d(self.woodbury_inv).T, self._K), self._K, [1,0]).T).squeeze() + self._covariance = (np.atleast_3d(self._K) - np.tensordot(np.dot(np.atleast_3d(self.woodbury_inv).T, self._K), self._K, [1,0]).T).squeeze() #self._covariance = self._K - self._K.dot(self.woodbury_inv).dot(self._K) return self._covariance @@ -153,9 +153,14 @@ class Posterior(object): $$ """ if self._woodbury_inv is None: - self._woodbury_inv, _ = dpotri(self.woodbury_chol, lower=1) - #self._woodbury_inv, _ = dpotrs(self.woodbury_chol, np.eye(self.woodbury_chol.shape[0]), lower=1) - symmetrify(self._woodbury_inv) + if self._woodbury_chol is not None: + self._woodbury_inv, _ = dpotri(self._woodbury_chol, lower=1) + #self._woodbury_inv, _ = dpotrs(self.woodbury_chol, np.eye(self.woodbury_chol.shape[0]), lower=1) + symmetrify(self._woodbury_inv) + elif self._covariance is not None: + B = self._K - self._covariance + tmp, _ = dpotrs(self.K_chol, B) + self._woodbury_inv, _ = dpotrs(self.K_chol, tmp.T) return self._woodbury_inv @property diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py index 0cc841ed..4f21bc29 100644 --- a/GPy/inference/latent_function_inference/var_dtc.py +++ b/GPy/inference/latent_function_inference/var_dtc.py @@ -7,9 +7,12 @@ from ...util import diag from ...core.parameterization.variational import VariationalPosterior import numpy as np from 
...util.misc import param_to_array +from . import LatentFunctionInference log_2_pi = np.log(2*np.pi) +import logging, itertools +logger = logging.getLogger('vardtc') -class VarDTC(object): +class VarDTC(LatentFunctionInference): """ An object for inference when the likelihood is Gaussian, but we want to do sparse inference. @@ -35,11 +38,11 @@ class VarDTC(object): return param_to_array(np.sum(np.square(Y))) def __getstate__(self): - # has to be overridden, as Cacher objects cannot be pickled. + # has to be overridden, as Cacher objects cannot be pickled. return self.limit def __setstate__(self, state): - # has to be overridden, as Cacher objects cannot be pickled. + # has to be overridden, as Cacher objects cannot be pickled. self.limit = state from ...util.caching import Cacher self.get_trYYT = Cacher(self._get_trYYT, self.limit) @@ -61,20 +64,9 @@ class VarDTC(object): return Y * prec # TODO chache this, and make it effective def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None): - if isinstance(X, VariationalPosterior): - uncertain_inputs = True - psi0 = kern.psi0(Z, X) - psi1 = kern.psi1(Z, X) - psi2 = kern.psi2(Z, X) - else: - uncertain_inputs = False - psi0 = kern.Kdiag(X) - psi1 = kern.K(X, Z) - psi2 = None - - #see whether we're using variational uncertain inputs _, output_dim = Y.shape + uncertain_inputs = isinstance(X, VariationalPosterior) #see whether we've got a different noise variance for each datum beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6) @@ -95,23 +87,21 @@ class VarDTC(object): diag.add(Kmm, self.const_jitter) Lm = jitchol(Kmm) - # The rather complex computations of A + + # The rather complex computations of A, and the psi stats if uncertain_inputs: + psi0 = kern.psi0(Z, X) + psi1 = kern.psi1(Z, X) if het_noise: - psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0) + psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0) else: - psi2_beta = psi2.sum(0) * beta - 
#if 0: - # evals, evecs = linalg.eigh(psi2_beta) - # clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable - # if not np.array_equal(evals, clipped_evals): - # pass # print evals - # tmp = evecs * np.sqrt(clipped_evals) - # tmp = tmp.T - # no backsubstitution because of bound explosion on tr(A) if not... + psi2_beta = kern.psi2(Z,X) * beta LmInv = dtrtri(Lm) A = LmInv.dot(psi2_beta.dot(LmInv.T)) else: + psi0 = kern.Kdiag(X) + psi1 = kern.K(X, Z) + psi2 = None if het_noise: tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1))) else: @@ -148,7 +138,7 @@ class VarDTC(object): log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, psi0, A, LB, trYYT, data_fit, VVT_factor) - #put the gradients in the right places + #noise derivatives dL_dR = _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, @@ -157,6 +147,7 @@ class VarDTC(object): dL_dthetaL = likelihood.exact_inference_gradients(dL_dR,Y_metadata) + #put the gradients in the right places if uncertain_inputs: grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, @@ -190,36 +181,62 @@ class VarDTC(object): post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm) return post, log_marginal, grad_dict -class VarDTCMissingData(object): - const_jitter = 1e-6 +class VarDTCMissingData(LatentFunctionInference): + const_jitter = 1e-10 def __init__(self, limit=1, inan=None): from ...util.caching import Cacher self._Y = Cacher(self._subarray_computations, limit) - self._inan = inan + if inan is not None: self._inan = ~inan + else: self._inan = None pass def set_limit(self, limit): self._Y.limit = limit + def __getstate__(self): + # has to be overridden, as Cacher objects cannot be pickled. + return self._Y.limit, self._inan + + def __setstate__(self, state): + # has to be overridden, as Cacher objects cannot be pickled. 
+ from ...util.caching import Cacher + self.limit = state[0] + self._inan = state[1] + self._Y = Cacher(self._subarray_computations, self.limit) + def _subarray_computations(self, Y): if self._inan is None: inan = np.isnan(Y) has_none = inan.any() + self._inan = ~inan else: inan = self._inan has_none = True if has_none: - from ...util.subarray_and_sorting import common_subarrays - self._subarray_indices = [] - for v,ind in common_subarrays(inan, 1).iteritems(): - if not np.all(v): - v = ~np.array(v, dtype=bool) - ind = np.array(ind, dtype=int) - if ind.size == Y.shape[1]: - ind = slice(None) - self._subarray_indices.append([v,ind]) - Ys = [Y[v, :][:, ind] for v, ind in self._subarray_indices] - traces = [(y**2).sum() for y in Ys] + #print "caching missing data slices, this can take several minutes depending on the number of unique dimensions of the data..." + #csa = common_subarrays(inan, 1) + size = Y.shape[1] + #logger.info('preparing subarrays {:3.3%}'.format((i+1.)/size)) + Ys = [] + next_ten = [0.] + count = itertools.count() + for v, y in itertools.izip(inan.T, Y.T[:,:,None]): + i = count.next() + if ((i+1.)/size) >= next_ten[0]: + logger.info('preparing subarrays {:>6.1%}'.format((i+1.)/size)) + next_ten[0] += .1 + Ys.append(y[v,:]) + + next_ten = [0.] 
+ count = itertools.count() + def trace(y): + i = count.next() + if ((i+1.)/size) >= next_ten[0]: + logger.info('preparing traces {:>6.1%}'.format((i+1.)/size)) + next_ten[0] += .1 + y = y[inan[:,i],i:i+1] + return np.einsum('ij,ij->', y,y) + traces = [trace(Y) for _ in xrange(size)] return Ys, traces else: self._subarray_indices = [[slice(None),slice(None)]] @@ -241,7 +258,6 @@ class VarDTCMissingData(object): beta_all = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6) het_noise = beta_all.size != 1 - import itertools num_inducing = Z.shape[0] dL_dpsi0_all = np.zeros(Y.shape[0]) @@ -261,18 +277,17 @@ class VarDTCMissingData(object): Lm = jitchol(Kmm) if uncertain_inputs: LmInv = dtrtri(Lm) - VVT_factor_all = np.empty(Y.shape) - full_VVT_factor = VVT_factor_all.shape[1] == Y.shape[1] - if not full_VVT_factor: - psi1V = np.dot(Y.T*beta_all, psi1_all).T - - for y, trYYT, [v, ind] in itertools.izip(Ys, traces, self._subarray_indices): - if het_noise: beta = beta_all[ind] + size = Y.shape[1] + next_ten = 0 + for i, [y, v, trYYT] in enumerate(itertools.izip(Ys, self._inan.T, traces)): + if ((i+1.)/size) >= next_ten: + logger.info('inference {:> 6.1%}'.format((i+1.)/size)) + next_ten += .1 + if het_noise: beta = beta_all[i] else: beta = beta_all - VVT_factor = (beta*y) - VVT_factor_all[v, ind].flat = VVT_factor.flat - output_dim = y.shape[1] + VVT_factor = (y*beta) + output_dim = 1#len(ind) psi0 = psi0_all[v] psi1 = psi1_all[v, :] @@ -314,7 +329,6 @@ class VarDTCMissingData(object): VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, psi1, het_noise, uncertain_inputs) - #import ipdb;ipdb.set_trace() dL_dpsi0_all[v] += dL_dpsi0 dL_dpsi1_all[v, :] += dL_dpsi1 if uncertain_inputs: @@ -331,19 +345,20 @@ class VarDTCMissingData(object): psi0, psi1, beta, data_fit, num_data, output_dim, trYYT, Y) - if full_VVT_factor: woodbury_vector[:, ind] = Cpsi1Vf - else: - print 'foobar' - tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0) - tmp, _ = dpotrs(LB, tmp, lower=1) - woodbury_vector[:, 
ind] = dtrtrs(Lm, tmp, lower=1, trans=1)[0] + #if full_VVT_factor: + woodbury_vector[:, i:i+1] = Cpsi1Vf + #else: + # print 'foobar' + # tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0) + # tmp, _ = dpotrs(LB, tmp, lower=1) + # woodbury_vector[:, ind] = dtrtrs(Lm, tmp, lower=1, trans=1)[0] #import ipdb;ipdb.set_trace() Bi, _ = dpotri(LB, lower=1) symmetrify(Bi) Bi = -dpotri(LB, lower=1)[0] diag.add(Bi, 1) - woodbury_inv_all[:, :, ind] = backsub_both_sides(Lm, Bi)[:,:,None] + woodbury_inv_all[:, :, i:i+1] = backsub_both_sides(Lm, Bi)[:,:,None] dL_dthetaL = likelihood.exact_inference_gradients(dL_dR) @@ -360,23 +375,6 @@ class VarDTCMissingData(object): 'dL_dKnm':dL_dpsi1_all, 'dL_dthetaL':dL_dthetaL} - #get sufficient things for posterior prediction - #TODO: do we really want to do this in the loop? - #if not full_VVT_factor: - # print 'foobar' - # psi1V = np.dot(Y.T*beta_all, psi1_all).T - # tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0) - # tmp, _ = dpotrs(LB_all, tmp, lower=1) - # woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1) - #import ipdb;ipdb.set_trace() - #Bi, _ = dpotri(LB_all, lower=1) - #symmetrify(Bi) - #Bi = -dpotri(LB_all, lower=1)[0] - #from ...util import diag - #diag.add(Bi, 1) - - #woodbury_inv = backsub_both_sides(Lm, Bi) - post = Posterior(woodbury_inv=woodbury_inv_all, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm) return post, log_marginal, grad_dict @@ -393,10 +391,7 @@ def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, C dL_dpsi2 = None else: dL_dpsi2 = beta * dL_dpsi2_beta - if uncertain_inputs: - # repeat for each of the N psi_2 matrices - dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0) - else: + if not uncertain_inputs: # subsume back into psi1 (==Kmn) dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2) dL_dpsi2 = None diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index 9b2da1c9..3bd5c347 100644 --- 
a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -7,6 +7,7 @@ from ...util import diag from ...core.parameterization.variational import VariationalPosterior import numpy as np from ...util.misc import param_to_array +from . import LatentFunctionInference log_2_pi = np.log(2*np.pi) from ...util import gpu_init @@ -15,11 +16,11 @@ try: import scikits.cuda.linalg as culinalg import pycuda.gpuarray as gpuarray from scikits.cuda import cublas - from ...util.linalg_gpu import logDiagSum, strideSum, mul_bcast, sum_axis, outer_prod, mul_bcast_first, join_prod + from ...util.linalg_gpu import logDiagSum, strideSum, mul_bcast, sum_axis, outer_prod, mul_bcast_first, join_prod, traceDot except: pass -class VarDTC_GPU(object): +class VarDTC_GPU(LatentFunctionInference): """ An object for inference when the likelihood is Gaussian, but we want to do sparse inference. @@ -65,18 +66,13 @@ class VarDTC_GPU(object): 'beta_gpu' :gpuarray.empty((ndata,),np.float64,order='F'), 'YT_gpu' :gpuarray.to_gpu(np.asfortranarray(Y.T)), # DxN 'betaYT_gpu' :gpuarray.empty(Y.T.shape,np.float64,order='F'), # DxN - 'psi2_t_gpu' :gpuarray.empty((num_inducing*num_inducing*self.batchsize),np.float64,order='F'), # inference_minibatch 'dL_dpsi0_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'), 'dL_dpsi1_gpu' :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'), - 'dL_dpsi2_gpu' :gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'), - 'dL_dthetaL_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'), - 'betapsi1_gpu' :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'), - 'thetaL_t_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'), - 'betaYT2_gpu' :gpuarray.empty((output_dim,self.batchsize),np.float64,order='F'), + 'dL_dpsi2_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), 'psi0p_gpu' 
:gpuarray.empty((self.batchsize,),np.float64,order='F'), 'psi1p_gpu' :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'), - 'psi2p_gpu' :gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'), + 'psi2p_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), } self.gpuCache['ones_gpu'].fill(1.0) @@ -125,6 +121,89 @@ class VarDTC_GPU(object): else: return jitchol(tdot(Y)) + def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs, het_noise): + num_inducing, input_dim = Z.shape[0], Z.shape[1] + num_data, output_dim = Y.shape + trYYT = self._trYYT + psi1Y_gpu = self.gpuCache['psi1Y_gpu'] + psi2_gpu = self.gpuCache['psi2_gpu'] + beta_gpu = self.gpuCache['beta_gpu'] + YT_gpu = self.gpuCache['YT_gpu'] + betaYT_gpu = self.gpuCache['betaYT_gpu'] + + beta_gpu.fill(beta) + betaYT_gpu.fill(0.) + cublas.cublasDaxpy(self.cublas_handle, betaYT_gpu.size, beta, YT_gpu.gpudata, 1, betaYT_gpu.gpudata, 1) + YRY_full = trYYT*beta + + if kern.useGPU: + psi1Y_gpu.fill(0.) + psi2_gpu.fill(0.) 
+ psi0_full = 0 + + for n_start in xrange(0,num_data,self.batchsize): + n_end = min(self.batchsize+n_start, num_data) + ndata = n_end - n_start + X_slice = X[n_start:n_end] + betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end] + + if uncertain_inputs: + psi0 = kern.psi0(Z, X_slice) + psi1p_gpu = kern.psi1(Z, X_slice) + psi2p_gpu = kern.psi2(Z, X_slice) + else: + psi0 = kern.Kdiag(X_slice) + psi1p_gpu = kern.K(X_slice, Z) + + cublas.cublasDgemm(self.cublas_handle, 'T', 'T', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaYT_gpu_slice.gpudata, output_dim, 1.0, psi1Y_gpu.gpudata, num_inducing) + + psi0_full += psi0.sum() + + if uncertain_inputs: + sum_axis(psi2_gpu,psi2p_gpu,1,1) + else: + cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, beta, psi1p_gpu.gpudata, ndata, psi1p_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing) + + psi0_full *= beta + if uncertain_inputs: + cublas.cublasDscal(self.cublas_handle, psi2_gpu.size, beta, psi2_gpu.gpudata, 1) + + else: + psi2_full = np.zeros((num_inducing,num_inducing)) + psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM + psi0_full = 0. + YRY_full = 0. 
+ + for n_start in xrange(0,num_data,self.batchsize): + n_end = min(self.batchsize+n_start, num_data) + Y_slice = Y[n_start:n_end] + X_slice = X[n_start:n_end] + + if het_noise: + b = beta[n_start] + YRY_full += np.inner(Y_slice, Y_slice)*b + else: + b = beta + + if uncertain_inputs: + psi0 = kern.psi0(Z, X_slice) + psi1 = kern.psi1(Z, X_slice) + psi2_full += kern.psi2(Z, X_slice)*b + else: + psi0 = kern.Kdiag(X_slice) + psi1 = kern.K(X_slice, Z) + psi2_full += np.dot(psi1.T,psi1)*b + + psi0_full += psi0.sum()*b + psi1Y_full += np.dot(Y_slice.T,psi1)*b # DxM + + if not het_noise: + YRY_full = trYYT*beta + psi1Y_gpu.set(psi1Y_full) + psi2_gpu.set(psi2_full) + + return psi0_full, YRY_full + def inference_likelihood(self, kern, X, Z, likelihood, Y): """ The first phase of inference: @@ -136,6 +215,12 @@ class VarDTC_GPU(object): num_inducing, input_dim = Z.shape[0], Z.shape[1] num_data, output_dim = Y.shape + #see whether we've got a different noise variance for each datum + beta = 1./np.fmax(likelihood.variance, 1e-6) + het_noise = beta.size > 1 + if het_noise: + self.batchsize=0 + self._initGPUCache(kern, num_inducing, input_dim, output_dim, Y) if isinstance(X, VariationalPosterior): @@ -143,123 +228,10 @@ class VarDTC_GPU(object): else: uncertain_inputs = False - #see whether we've got a different noise variance for each datum - beta = 1./np.fmax(likelihood.variance, 1e-6) - het_noise = beta.size > 1 - trYYT = self._trYYT - psi1Y_gpu = self.gpuCache['psi1Y_gpu'] psi2_gpu = self.gpuCache['psi2_gpu'] - beta_gpu = self.gpuCache['beta_gpu'] - YT_gpu = self.gpuCache['YT_gpu'] - betaYT_gpu = self.gpuCache['betaYT_gpu'] - psi2_t_gpu = self.gpuCache['psi2_t_gpu'] - if het_noise: - beta_gpu.set(np.asfortranarray(beta)) - mul_bcast(betaYT_gpu,beta_gpu,YT_gpu,beta_gpu.size) - YRY_full = cublas.cublasDdot(self.cublas_handle, YT_gpu.size, betaYT_gpu.gpudata, 1, YT_gpu.gpudata, 1) - else: - beta_gpu.fill(beta) - betaYT_gpu.fill(0.) 
- cublas.cublasDaxpy(self.cublas_handle, betaYT_gpu.size, beta, YT_gpu.gpudata, 1, betaYT_gpu.gpudata, 1) - YRY_full = trYYT*beta - - if kern.useGPU: - psi1Y_gpu.fill(0.) - psi2_gpu.fill(0.) - psi0_full = 0 - - for n_start in xrange(0,num_data,self.batchsize): - n_end = min(self.batchsize+n_start, num_data) - ndata = n_end - n_start - X_slice = X[n_start:n_end] - beta_gpu_slice = beta_gpu[n_start:n_end] - betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end] - if ndata==self.batchsize: - psi2_t_gpu_slice = psi2_t_gpu - else: - psi2_t_gpu_slice = psi2_t_gpu[:num_inducing*num_inducing*ndata] - if uncertain_inputs: - psi0p_gpu = kern.psi0(Z, X_slice) - psi1p_gpu = kern.psi1(Z, X_slice) - psi2p_gpu = kern.psi2(Z, X_slice) - else: - psi0p_gpu = kern.Kdiag(X_slice) - psi1p_gpu = kern.K(X_slice, Z) - - cublas.cublasDgemm(self.cublas_handle, 'T', 'T', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaYT_gpu_slice.gpudata, output_dim, 1.0, psi1Y_gpu.gpudata, num_inducing) - - if het_noise: - psi0_full += cublas.cublasDdot(self.cublas_handle, psi0p_gpu.size, beta_gpu_slice.gpudata, 1, psi0p_gpu.gpudata, 1) - else: - psi0_full += gpuarray.sum(psi0p_gpu).get() - - if uncertain_inputs: - if het_noise: - mul_bcast(psi2_t_gpu_slice,beta_gpu_slice,psi2p_gpu,beta_gpu_slice.size) - sum_axis(psi2_gpu,psi2_t_gpu_slice,1,ndata) - else: - sum_axis(psi2_gpu,psi2p_gpu,1,ndata) - else: - if het_noise: - psi1_t_gpu = psi2_t_gpu_slice[:,num_inducing*ndata] - mul_bcast(psi1_t_gpu,beta_gpu_slice,psi1p_gpu,beta_gpu_slice.size) - cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, 1.0, psi1p_gpu.gpudata, ndata, psi1_t_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing) - else: - cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, beta, psi1p_gpu.gpudata, ndata, psi1p_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing) - - if not het_noise: - psi0_full *= beta - if uncertain_inputs: - 
cublas.cublasDscal(self.cublas_handle, psi2_gpu.size, beta, psi2_gpu.gpudata, 1) - - else: - psi2_full = np.zeros((num_inducing,num_inducing),order='F') - psi1Y_full = np.zeros((num_inducing,output_dim),order='F') # MxD - psi0_full = 0 -# YRY_full = 0 - - for n_start in xrange(0,num_data,self.batchsize): - n_end = min(self.batchsize+n_start, num_data) - Y_slice = Y[n_start:n_end] - X_slice = X[n_start:n_end] - if uncertain_inputs: - psi0 = kern.psi0(Z, X_slice) - psi1 = kern.psi1(Z, X_slice) - psi2 = kern.psi2(Z, X_slice) - else: - psi0 = kern.Kdiag(X_slice) - psi1 = kern.K(X_slice, Z) - - if het_noise: - beta_slice = beta[n_start:n_end] - psi0_full += (beta_slice*psi0).sum() - psi1Y_full += np.dot(psi1.T,beta_slice[:,None]*Y_slice) # MxD -# YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum() - else: - psi0_full += psi0.sum() - psi1Y_full += np.dot(psi1.T,Y_slice) # MxD - - if uncertain_inputs: - if het_noise: - psi2_full += np.einsum('n,nmo->mo',beta_slice,psi2) - else: - psi2_full += psi2.sum(axis=0) - else: - if het_noise: - psi2_full += np.einsum('n,nm,no->mo',beta_slice,psi1,psi1) - else: - psi2_full += tdot(psi1.T) - - if not het_noise: - psi0_full *= beta - psi1Y_full *= beta - psi2_full *= beta -# YRY_full = trYYT*beta - - psi1Y_gpu.set(psi1Y_full) - psi2_gpu.set(psi2_full) + psi0_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs, het_noise) #====================================================================== # Compute Common Components @@ -372,6 +344,16 @@ class VarDTC_GPU(object): post = Posterior(woodbury_inv=KmmInvPsi2P_gpu.get(), woodbury_vector=v_gpu.get(), K=Kmm_gpu.get(), mean=None, cov=None, K_chol=Lm_gpu.get()) + #====================================================================== + # Compute dL_dthetaL for uncertian input and non-heter noise + #====================================================================== + + if not het_noise: + dL_dthetaL = (YRY_full + output_dim*psi0_full - 
num_data*output_dim)/-2. + dL_dthetaL += cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2_gpu.gpudata,1) + dL_dthetaL += cublas.cublasDdot(self.cublas_handle,v_gpu.size, v_gpu.gpudata,1,psi1Y_gpu.gpudata,1) + self.midRes['dL_dthetaL'] = -beta*dL_dthetaL + return logL, dL_dKmm_gpu.get(), post def inference_minibatch(self, kern, X, Z, likelihood, Y): @@ -403,26 +385,26 @@ class VarDTC_GPU(object): nSlice = n_end-n_start X_slice = X[n_start:n_end] + if het_noise: + beta = beta[n_start] # nSlice==1 if kern.useGPU: - if uncertain_inputs: - psi0p_gpu = kern.psi0(Z, X_slice) - psi1p_gpu = kern.psi1(Z, X_slice) - psi2p_gpu = kern.psi2(Z, X_slice) - else: + if not uncertain_inputs: psi0p_gpu = kern.Kdiag(X_slice) psi1p_gpu = kern.K(X_slice, Z) psi2p_gpu = self.gpuCache['psi2p_gpu'] - if psi2p_gpu.shape[0] > nSlice: - psi2p_gpu = psi2p_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) - else: - if uncertain_inputs: + elif het_noise: + psi0p_gpu = kern.psi0(Z, X_slice) + psi1p_gpu = kern.psi1(Z, X_slice) + psi2p_gpu = kern.psi2(Z, X_slice) + elif not uncertain_inputs or het_noise: + if not uncertain_inputs: + psi0 = kern.Kdiag(X_slice) + psi1 = kern.K(X_slice, Z) + elif het_noise: psi0 = kern.psi0(Z, X_slice) psi1 = kern.psi1(Z, X_slice) psi2 = kern.psi2(Z, X_slice) - else: - psi0 = kern.Kdiag(X_slice) - psi1 = kern.K(X_slice, Z) psi0p_gpu = self.gpuCache['psi0p_gpu'] psi1p_gpu = self.gpuCache['psi1p_gpu'] @@ -430,91 +412,46 @@ class VarDTC_GPU(object): if psi0p_gpu.shape[0] > nSlice: psi0p_gpu = psi0p_gpu[:nSlice] psi1p_gpu = psi1p_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing) - psi2p_gpu = psi2p_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) psi0p_gpu.set(np.asfortranarray(psi0)) psi1p_gpu.set(np.asfortranarray(psi1)) if uncertain_inputs: psi2p_gpu.set(np.asfortranarray(psi2)) - - 
#====================================================================== - # Prepare gpu memory - #====================================================================== - - dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu'] - v_gpu = self.gpuCache['v_gpu'] - betaYT_gpu = self.gpuCache['betaYT_gpu'] - beta_gpu = self.gpuCache['beta_gpu'] - dL_dpsi0_gpu = self.gpuCache['dL_dpsi0_gpu'] - dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu'] - dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu'] - dL_dthetaL_gpu = self.gpuCache['dL_dthetaL_gpu'] - psi2R_gpu = self.gpuCache['psi2_t_gpu'][:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) - betapsi1_gpu = self.gpuCache['betapsi1_gpu'] - thetaL_t_gpu = self.gpuCache['thetaL_t_gpu'] - betaYT2_gpu = self.gpuCache['betaYT2_gpu'] - - betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end] - beta_gpu_slice = beta_gpu[n_start:n_end] - - # Adjust to the batch size - if dL_dpsi0_gpu.shape[0] > nSlice: - betaYT2_gpu = betaYT2_gpu[:,:nSlice] - dL_dpsi0_gpu = dL_dpsi0_gpu.ravel()[:nSlice] - dL_dpsi1_gpu = dL_dpsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing) - dL_dpsi2_gpu = dL_dpsi2_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) - dL_dthetaL_gpu = dL_dthetaL_gpu.ravel()[:nSlice] - psi2R_gpu = psi2R_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) - thetaL_t_gpu = thetaL_t_gpu.ravel()[:nSlice] - betapsi1_gpu = betapsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing) - - mul_bcast(betapsi1_gpu,beta_gpu_slice,psi1p_gpu,beta_gpu_slice.size) - + #====================================================================== # Compute dL_dpsi #====================================================================== + + dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu'] + v_gpu = self.gpuCache['v_gpu'] + dL_dpsi0_gpu = self.gpuCache['dL_dpsi0_gpu'] + dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu'] + dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu'] + 
betaYT_gpu = self.gpuCache['betaYT_gpu'] + betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end] - dL_dpsi0_gpu.fill(0.) - cublas.cublasDaxpy(self.cublas_handle, dL_dpsi0_gpu.size, output_dim/(-2.), beta_gpu_slice.gpudata, 1, dL_dpsi0_gpu.gpudata, 1) + # Adjust to the batch size + if dL_dpsi0_gpu.shape[0] > nSlice: + dL_dpsi0_gpu = dL_dpsi0_gpu.ravel()[:nSlice] + dL_dpsi1_gpu = dL_dpsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing) + + dL_dpsi0_gpu.fill(-output_dim *beta/2.) cublas.cublasDgemm(self.cublas_handle, 'T', 'T', nSlice, num_inducing, output_dim, 1.0, betaYT_gpu_slice.gpudata, output_dim, v_gpu.gpudata, num_inducing, 0., dL_dpsi1_gpu.gpudata, nSlice) if uncertain_inputs: - outer_prod(dL_dpsi2_gpu,beta_gpu_slice,dL_dpsi2R_gpu,beta_gpu_slice.size) + cublas.cublasDcopy(self.cublas_handle, dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata, 1, dL_dpsi2_gpu.gpudata, 1) + cublas.cublasDscal(self.cublas_handle, dL_dpsi2_gpu.size, beta, dL_dpsi2_gpu.gpudata, 1) else: - cublas.cublasDgemm(self.cublas_handle, 'N', 'N', nSlice, num_inducing, output_dim, 1.0, betapsi1_gpu.gpudata, nSlice, dL_dpsi2R_gpu.gpudata, num_inducing, 1.0, dL_dpsi1_gpu.gpudata, nSlice) - + cublas.cublasDgemm(self.cublas_handle, 'N', 'N', nSlice, num_inducing, output_dim, beta, psi1p_gpu.gpudata, nSlice, dL_dpsi2R_gpu.gpudata, num_inducing, 1.0, dL_dpsi1_gpu.gpudata, nSlice) + #====================================================================== # Compute dL_dthetaL #====================================================================== - - if not uncertain_inputs: - join_prod(psi2p_gpu,psi1p_gpu,psi1p_gpu,nSlice,num_inducing) - - mul_bcast_first(psi2R_gpu,dL_dpsi2R_gpu,psi2p_gpu,nSlice) - - - dL_dthetaL_gpu.fill(0.) 
- - cublas.cublasDcopy(self.cublas_handle, betaYT_gpu_slice.size, betaYT_gpu_slice.gpudata, 1, betaYT2_gpu.gpudata, 1) - mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT2_gpu,betaYT2_gpu.size) - cublas.cublasDscal(self.cublas_handle, betaYT2_gpu.size, 0.5, betaYT2_gpu.gpudata, 1) - sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim) - - cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, output_dim/(-2.0), beta_gpu_slice.gpudata, 1, dL_dthetaL_gpu.gpudata, 1) - cublas.cublasDcopy(self.cublas_handle, beta_gpu_slice.size, beta_gpu_slice.gpudata, 1, thetaL_t_gpu.gpudata, 1) - mul_bcast(thetaL_t_gpu,thetaL_t_gpu,thetaL_t_gpu,thetaL_t_gpu.size) - mul_bcast(thetaL_t_gpu,thetaL_t_gpu,psi0p_gpu,thetaL_t_gpu.size) - cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, output_dim/2.0, thetaL_t_gpu.gpudata, 1, dL_dthetaL_gpu.gpudata, 1) - - thetaL_t_gpu.fill(0.) - sum_axis(thetaL_t_gpu, psi2R_gpu, nSlice, num_inducing*num_inducing) - mul_bcast(thetaL_t_gpu,thetaL_t_gpu,beta_gpu_slice,thetaL_t_gpu.size) - mul_bcast(thetaL_t_gpu,thetaL_t_gpu,beta_gpu_slice,thetaL_t_gpu.size) - cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, -1.0, thetaL_t_gpu.gpudata, 1, dL_dthetaL_gpu.gpudata, 1) - - cublas.cublasDgemm(self.cublas_handle, 'T', 'T', output_dim, nSlice, num_inducing, -1.0, v_gpu.gpudata, num_inducing, betapsi1_gpu.gpudata, nSlice, 0.0, betaYT2_gpu.gpudata, output_dim) - mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT_gpu_slice,betaYT2_gpu.size) - sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim) + if het_noise: + betaY = betaYT_gpu_slice.get() + dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0p_gpu.get())-output_dim*beta)/2. 
+ dL_dthetaL += -beta*beta*cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2p_gpu.gpudata,1) + dL_dthetaL += -beta*(betaY*np.dot(psi1p_gpu.get(),v_gpu.get())).sum(axis=-1) if kern.useGPU: dL_dpsi0 = dL_dpsi0_gpu @@ -527,10 +464,11 @@ class VarDTC_GPU(object): dL_dpsi2 = dL_dpsi2_gpu else: dL_dpsi2 = dL_dpsi2_gpu.get() - if het_noise: - dL_dthetaL = dL_dthetaL_gpu.get() - else: - dL_dthetaL = gpuarray.sum(dL_dthetaL_gpu).get() + if not het_noise: + if isEnd: + dL_dthetaL = self.midRes['dL_dthetaL'] + else: + dL_dthetaL = 0. if uncertain_inputs: grad_dict = {'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, @@ -540,6 +478,6 @@ class VarDTC_GPU(object): grad_dict = {'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1, 'dL_dthetaL':dL_dthetaL} - + return isEnd, (n_start,n_end), grad_dict diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py index 87236e2a..b834d942 100644 --- a/GPy/inference/latent_function_inference/var_dtc_parallel.py +++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py @@ -7,9 +7,15 @@ from ...util import diag from ...core.parameterization.variational import VariationalPosterior import numpy as np from ...util.misc import param_to_array +from . import LatentFunctionInference log_2_pi = np.log(2*np.pi) -class VarDTC_minibatch(object): +try: + from mpi4py import MPI +except: + pass + +class VarDTC_minibatch(LatentFunctionInference): """ An object for inference when the likelihood is Gaussian, but we want to do sparse inference. 
@@ -20,9 +26,11 @@ class VarDTC_minibatch(object): """ const_jitter = 1e-6 - def __init__(self, batchsize, limit=1): + def __init__(self, batchsize=None, limit=1, mpi_comm=None): self.batchsize = batchsize + self.mpi_comm = mpi_comm + self.limit = limit # Cache functions from ...util.caching import Cacher @@ -31,6 +39,21 @@ class VarDTC_minibatch(object): self.midRes = {} self.batch_pos = 0 # the starting position of the current mini-batch + self.Y_speedup = False # Replace Y with the cholesky factor of YY.T, but the posterior inference will be wrong + + def __getstate__(self): + # has to be overridden, as Cacher objects cannot be pickled. + return self.batchsize, self.limit, self.Y_speedup + + def __setstate__(self, state): + # has to be overridden, as Cacher objects cannot be pickled. + self.batchsize, self.limit, self.Y_speedup = state + self.mpi_comm = None + self.midRes = {} + self.batch_pos = 0 + from ...util.caching import Cacher + self.get_trYYT = Cacher(self._get_trYYT, self.limit) + self.get_YYTfactor = Cacher(self._get_YYTfactor, self.limit) def set_limit(self, limit): self.get_trYYT.limit = limit @@ -51,6 +74,67 @@ class VarDTC_minibatch(object): else: return jitchol(tdot(Y)) + def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs): + + het_noise = beta.size > 1 + + trYYT = self.get_trYYT(Y) + if self.Y_speedup and not het_noise: + Y = self.get_YYTfactor(Y) + + num_inducing = Z.shape[0] + num_data, output_dim = Y.shape + if self.batchsize == None: + self.batchsize = num_data + + psi2_full = np.zeros((num_inducing,num_inducing)) + psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM + psi0_full = 0. + YRY_full = 0. 
+ + for n_start in xrange(0,num_data,self.batchsize): + n_end = min(self.batchsize+n_start, num_data) + if (n_end-n_start)==num_data: + Y_slice = Y + X_slice = X + else: + Y_slice = Y[n_start:n_end] + X_slice = X[n_start:n_end] + + if het_noise: + b = beta[n_start] + YRY_full += np.inner(Y_slice, Y_slice)*b + else: + b = beta + + if uncertain_inputs: + psi0 = kern.psi0(Z, X_slice) + psi1 = kern.psi1(Z, X_slice) + psi2_full += kern.psi2(Z, X_slice)*b + else: + psi0 = kern.Kdiag(X_slice) + psi1 = kern.K(X_slice, Z) + psi2_full += np.dot(psi1.T,psi1)*b + + psi0_full += psi0.sum()*b + psi1Y_full += np.dot(Y_slice.T,psi1)*b # DxM + + if not het_noise: + YRY_full = trYYT*beta + + if self.mpi_comm != None: + psi0_all = np.array(psi0_full) + psi1Y_all = psi1Y_full.copy() + psi2_all = psi2_full.copy() + YRY_all = np.array(YRY_full) + self.mpi_comm.Allreduce([psi0_full, MPI.DOUBLE], [psi0_all, MPI.DOUBLE]) + self.mpi_comm.Allreduce([psi1Y_full, MPI.DOUBLE], [psi1Y_all, MPI.DOUBLE]) + self.mpi_comm.Allreduce([psi2_full, MPI.DOUBLE], [psi2_all, MPI.DOUBLE]) + self.mpi_comm.Allreduce([YRY_full, MPI.DOUBLE], [YRY_all, MPI.DOUBLE]) + return psi0_all, psi1Y_all, psi2_all, YRY_all + + return psi0_full, psi1Y_full, psi2_full, YRY_full + def inference_likelihood(self, kern, X, Z, likelihood, Y): """ The first phase of inference: @@ -59,8 +143,11 @@ class VarDTC_minibatch(object): Cached intermediate results: Kmm, KmmInv, """ - num_inducing = Z.shape[0] - num_data, output_dim = Y.shape + num_data, output_dim = Y.shape + if self.mpi_comm != None: + num_data_all = np.array(num_data,dtype=np.int32) + self.mpi_comm.Allreduce([np.int32(num_data), MPI.INT], [num_data_all, MPI.INT]) + num_data = num_data_all if isinstance(X, VariationalPosterior): uncertain_inputs = True @@ -70,75 +157,31 @@ class VarDTC_minibatch(object): #see whether we've got a different noise variance for each datum beta = 1./np.fmax(likelihood.variance, 1e-6) het_noise = beta.size > 1 - # VVT_factor is a matrix such 
that tdot(VVT_factor) = VVT...this is for efficiency! - #self.YYTfactor = beta*self.get_YYTfactor(Y) - YYT_factor = Y - trYYT = self.get_trYYT(Y) + if het_noise: + self.batchsize = 1 + + psi0_full, psi1Y_full, psi2_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs) - - psi2_full = np.zeros((num_inducing,num_inducing)) - psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM - psi0_full = 0 - YRY_full = 0 - - for n_start in xrange(0,num_data,self.batchsize): - - n_end = min(self.batchsize+n_start, num_data) - - Y_slice = YYT_factor[n_start:n_end] - X_slice = X[n_start:n_end] - - if uncertain_inputs: - psi0 = kern.psi0(Z, X_slice) - psi1 = kern.psi1(Z, X_slice) - psi2 = kern.psi2(Z, X_slice) - else: - psi0 = kern.Kdiag(X_slice) - psi1 = kern.K(X_slice, Z) - psi2 = None - - if het_noise: - beta_slice = beta[n_start:n_end] - psi0_full += (beta_slice*psi0).sum() - psi1Y_full += np.dot(beta_slice*Y_slice.T,psi1) # DxM - YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum() - else: - psi0_full += psi0.sum() - psi1Y_full += np.dot(Y_slice.T,psi1) # DxM - - - if uncertain_inputs: - if het_noise: - psi2_full += np.einsum('n,nmo->mo',beta_slice,psi2) - else: - psi2_full += psi2.sum(axis=0) - else: - if het_noise: - psi2_full += np.einsum('n,nm,no->mo',beta_slice,psi1,psi1) - else: - psi2_full += tdot(psi1.T) - - if not het_noise: - psi0_full *= beta - psi1Y_full *= beta - psi2_full *= beta - YRY_full = trYYT*beta - #====================================================================== # Compute Common Components #====================================================================== + from ...util.debug import checkFullRank + Kmm = kern.K(Z).copy() diag.add(Kmm, self.const_jitter) + checkFullRank(Kmm) Lm = jitchol(Kmm) - - Lambda = Kmm+psi2_full + + LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right') + Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT + checkFullRank(Lambda) LL = jitchol(Lambda) + LL = np.dot(Lm,LL) b,_ = 
dtrtrs(LL, psi1Y_full.T) bbt = np.square(b).sum() v,_ = dtrtrs(LL.T,b,lower=False) vvt = np.einsum('md,od->mo',v,v) - LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right') Psi2LLInvT = dtrtrs(LL,psi2_full)[0].T LmInvPsi2LLInvT= dtrtrs(Lm,Psi2LLInvT)[0] @@ -170,12 +213,18 @@ class VarDTC_minibatch(object): # Compute the Posterior distribution of inducing points p(u|Y) #====================================================================== -# phi_u_mean = np.dot(Kmm,v) -# LLInvKmm,_ = dtrtrs(LL,Kmm) -# # phi_u_var = np.einsum('ma,mb->ab',LLInvKmm,LLInvKmm) -# phi_u_var = Kmm - np.dot(LLInvKmm.T,LLInvKmm) + if not self.Y_speedup or het_noise: + post = Posterior(woodbury_inv=KmmInvPsi2P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=Lm) + else: + post = None - post = Posterior(woodbury_inv=KmmInvPsi2P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=Lm) + #====================================================================== + # Compute dL_dthetaL for uncertian input and non-heter noise + #====================================================================== + + if not het_noise: + dL_dthetaL = (YRY_full*beta + beta*output_dim*psi0_full - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2_full).sum() - beta*(v.T*psi1Y_full).sum() + self.midRes['dL_dthetaL'] = dL_dthetaL return logL, dL_dKmm, post @@ -198,7 +247,10 @@ class VarDTC_minibatch(object): het_noise = beta.size > 1 # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency! 
#self.YYTfactor = beta*self.get_YYTfactor(Y) - YYT_factor = Y + if self.Y_speedup and not het_noise: + YYT_factor = self.get_YYTfactor(Y) + else: + YYT_factor = Y n_start = self.batch_pos n_end = min(self.batchsize+n_start, num_data) @@ -209,24 +261,24 @@ class VarDTC_minibatch(object): isEnd = False self.batch_pos = n_end - num_slice = n_end-n_start Y_slice = YYT_factor[n_start:n_end] X_slice = X[n_start:n_end] - if uncertain_inputs: - psi0 = kern.psi0(Z, X_slice) - psi1 = kern.psi1(Z, X_slice) - psi2 = kern.psi2(Z, X_slice) - else: + if not uncertain_inputs: psi0 = kern.Kdiag(X_slice) psi1 = kern.K(X_slice, Z) psi2 = None + betapsi1 = np.einsum('n,nm->nm',beta,psi1) + elif het_noise: + psi0 = kern.psi0(Z, X_slice) + psi1 = kern.psi1(Z, X_slice) + psi2 = kern.psi2(Z, X_slice) + betapsi1 = np.einsum('n,nm->nm',beta,psi1) if het_noise: - beta = beta[n_start:n_end] + beta = beta[n_start] # assuming batchsize==1 betaY = beta*Y_slice - betapsi1 = np.einsum('n,nm->nm',beta,psi1) #====================================================================== # Load Intermediate Results @@ -234,17 +286,17 @@ class VarDTC_minibatch(object): dL_dpsi2R = self.midRes['dL_dpsi2R'] v = self.midRes['v'] - + #====================================================================== # Compute dL_dpsi #====================================================================== - dL_dpsi0 = -0.5 * output_dim * (beta * np.ones((n_end-n_start,))) + dL_dpsi0 = -output_dim * (beta * np.ones((n_end-n_start,)))/2. dL_dpsi1 = np.dot(betaY,v.T) if uncertain_inputs: - dL_dpsi2 = np.einsum('n,mo->nmo',beta * np.ones((n_end-n_start,)),dL_dpsi2R) + dL_dpsi2 = beta* dL_dpsi2R else: dL_dpsi1 += np.dot(betapsi1,dL_dpsi2R)*2. 
dL_dpsi2 = None @@ -255,19 +307,17 @@ class VarDTC_minibatch(object): if het_noise: if uncertain_inputs: - psiR = np.einsum('mo,nmo->n',dL_dpsi2R,psi2) - else: - psiR = np.einsum('nm,no,mo->n',psi1,psi1,dL_dpsi2R) - - dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1) - else: - if uncertain_inputs: - psiR = np.einsum('mo,nmo->',dL_dpsi2R,psi2) + psiR = np.einsum('mo,mo->',dL_dpsi2R,psi2) else: psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R) - dL_dthetaL = ((np.square(betaY)).sum() + np.square(beta)*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum() - + dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1) + else: + if isEnd: + dL_dthetaL = self.midRes['dL_dthetaL'] + else: + dL_dthetaL = 0. + if uncertain_inputs: grad_dict = {'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, @@ -281,36 +331,45 @@ class VarDTC_minibatch(object): return isEnd, (n_start,n_end), grad_dict -def update_gradients(model): - model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, model.X, model.Z, model.likelihood, model.Y) +def update_gradients(model, mpi_comm=None): + if mpi_comm == None: + Y = model.Y + X = model.X + else: + Y = model.Y_local + X = model.X[model.N_range[0]:model.N_range[1]] + + model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, X, model.Z, model.likelihood, Y) het_noise = model.likelihood.variance.size > 1 if het_noise: dL_dthetaL = np.empty((model.Y.shape[0],)) else: - dL_dthetaL = 0 - - #gradients w.r.t. kernel - model.kern.update_gradients_full(dL_dKmm, model.Z, None) + dL_dthetaL = np.float64(0.) + kern_grad = model.kern.gradient.copy() - - #gradients w.r.t. 
Z - model.Z.gradient[:,model.kern.active_dims] = model.kern.gradients_X(dL_dKmm, model.Z) + kern_grad[:] = 0. + model.Z.gradient = 0. isEnd = False while not isEnd: - isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, model.X, model.Z, model.likelihood, model.Y) + isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, X, model.Z, model.likelihood, Y) if isinstance(model.X, VariationalPosterior): - X_slice = model.X[n_range[0]:n_range[1]] + if (n_range[1]-n_range[0])==X.shape[0]: + X_slice = X + elif mpi_comm ==None: + X_slice = model.X[n_range[0]:n_range[1]] + else: + X_slice = model.X[model.N_range[0]+n_range[0]:model.N_range[0]+n_range[1]] #gradients w.r.t. kernel model.kern.update_gradients_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2']) kern_grad += model.kern.gradient #gradients w.r.t. Z - model.Z.gradient[:,model.kern.active_dims] += model.kern.gradients_Z_expectations( - grad_dict['dL_dpsi1'], grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=X_slice) + model.Z.gradient += model.kern.gradients_Z_expectations( + dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=X_slice) #gradients w.r.t. posterior parameters of X X_grad = model.kern.gradients_qX_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2']) @@ -321,13 +380,39 @@ def update_gradients(model): else: dL_dthetaL += grad_dict['dL_dthetaL'] - # Set the gradients w.r.t. 
kernel - model.kern.gradient = kern_grad - - # Update Log-likelihood - model._log_marginal_likelihood -= model.variational_prior.KL_divergence(model.X) - # update for the KL divergence - model.variational_prior.update_gradients_KL(model.X) + # Gather the gradients from multiple MPI nodes + if mpi_comm != None: + if het_noise: + raise "het_noise not implemented!" + kern_grad_all = kern_grad.copy() + Z_grad_all = model.Z.gradient.copy() + mpi_comm.Allreduce([kern_grad, MPI.DOUBLE], [kern_grad_all, MPI.DOUBLE]) + mpi_comm.Allreduce([model.Z.gradient, MPI.DOUBLE], [Z_grad_all, MPI.DOUBLE]) + kern_grad = kern_grad_all + model.Z.gradient = Z_grad_all + #gradients w.r.t. kernel + model.kern.update_gradients_full(dL_dKmm, model.Z, None) + model.kern.gradient += kern_grad + + #gradients w.r.t. Z + model.Z.gradient += model.kern.gradients_X(dL_dKmm, model.Z) + + # Update Log-likelihood + KL_div = model.variational_prior.KL_divergence(X) + # update for the KL divergence + model.variational_prior.update_gradients_KL(X) + + if mpi_comm != None: + KL_div_all = np.array(KL_div) + mpi_comm.Allreduce([np.float64(KL_div), MPI.DOUBLE], [KL_div_all, MPI.DOUBLE]) + KL_div = KL_div_all + [mpi_comm.Allgatherv([pp.copy(), MPI.DOUBLE], [pa, (model.N_list*pa.shape[-1], None), MPI.DOUBLE]) for pp,pa in zip(model.get_X_gradients(X),model.get_X_gradients(model.X))] +# from ...models import SSGPLVM +# if isinstance(model, SSGPLVM): +# grad_pi = np.array(model.variational_prior.pi.gradient) +# mpi_comm.Allreduce([grad_pi.copy(), MPI.DOUBLE], [model.variational_prior.pi.gradient, MPI.DOUBLE]) + model._log_marginal_likelihood -= KL_div + # dL_dthetaL model.likelihood.update_gradients(dL_dthetaL) diff --git a/GPy/inference/optimization/__init__.py b/GPy/inference/optimization/__init__.py index 1a8f043b..1590568f 100644 --- a/GPy/inference/optimization/__init__.py +++ b/GPy/inference/optimization/__init__.py @@ -1,2 +1,3 @@ from scg import SCG from optimization import * +from hmc import 
HMC,HMC_shortcut diff --git a/GPy/inference/optimization/hmc.py b/GPy/inference/optimization/hmc.py new file mode 100644 index 00000000..8c65cdf0 --- /dev/null +++ b/GPy/inference/optimization/hmc.py @@ -0,0 +1,157 @@ +"""HMC implementation""" + +import numpy as np + + +class HMC: + def __init__(self,model,M=None,stepsize=1e-1): + self.model = model + self.stepsize = stepsize + self.p = np.empty_like(model.optimizer_array.copy()) + if M is None: + self.M = np.eye(self.p.size) + else: + self.M = M + self.Minv = np.linalg.inv(self.M) + + def sample(self, m_iters=1000, hmc_iters=20): + params = np.empty((m_iters,self.p.size)) + for i in xrange(m_iters): + self.p[:] = np.random.multivariate_normal(np.zeros(self.p.size),self.M) + H_old = self._computeH() + theta_old = self.model.optimizer_array.copy() + params[i] = self.model.unfixed_param_array + #Matropolis + self._update(hmc_iters) + H_new = self._computeH() + + if H_old>H_new: + k = 1. + else: + k = np.exp(H_old-H_new) + if np.random.rand()H_new: + k = 1. + else: + k = np.exp(H_old-H_new) + if np.random.rand()pos: + pos = -1 + i += pos + self.model.optimizer_array = theta_buf[hmc_iters] + self.p[:] = -p_buf[hmc_iters] + else: + pos_new = pos-hmc_iters+i + self.model.optimizer_array = theta_buf[hmc_iters+pos_new] + self.p[:] = -p_buf[hmc_iters+pos_new] + break + else: + Hlist = range(hmc_iters+pos,hmc_iters+pos+self.groupsize) +# print Hlist +# print self._testH(H_buf[Hlist]) + + if self._testH(H_buf[Hlist]): + pos += -1 + else: + # Reverse the trajectory for the 2nd time + r = (hmc_iters - i)%((reversal[0]-pos)*2) + if r>(reversal[0]-pos): + pos_new = 2*reversal[0] - r - pos + else: + pos_new = pos + r + self.model.optimizer_array = theta_buf[hmc_iters+pos_new] + self.p[:] = p_buf[hmc_iters+pos_new] # the sign of momentum might be wrong! 
+# print reversal[0],pos,pos_new +# print H_buf + break + + def _testH(self, Hlist): + Hstd = np.std(Hlist) +# print Hlist +# print Hstd + if Hstdself.Hstd_th[1]: + return False + else: + return True + + def _computeH(self,): + return self.model.objective_function()+self.p.size*np.log(2*np.pi)/2.+np.log(np.linalg.det(self.M))/2.+np.dot(self.p, np.dot(self.Minv,self.p[:,None]))/2. + diff --git a/GPy/inference/optimization/scg.py b/GPy/inference/optimization/scg.py index c99fa7d1..e183b7a8 100644 --- a/GPy/inference/optimization/scg.py +++ b/GPy/inference/optimization/scg.py @@ -32,7 +32,7 @@ def print_out(len_maxiters, fnow, current_grad, beta, iteration): sys.stdout.flush() def exponents(fnow, current_grad): - exps = [np.abs(fnow), current_grad] + exps = [np.abs(np.float(fnow)), current_grad] return np.sign(exps) * np.log10(exps).astype(int) def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True, xtol=None, ftol=None, gtol=None): @@ -56,13 +56,13 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True, if gtol is None: gtol = 1e-5 - sigma0 = 1.0e-8 + sigma0 = 1.0e-7 fold = f(x, *optargs) # Initial function value. function_eval = 1 fnow = fold gradnew = gradf(x, *optargs) # Initial gradient. - if any(np.isnan(gradnew)): - raise UnexpectedInfOrNan, "Gradient contribution resulted in a NaN value" + #if any(np.isnan(gradnew)): + # raise UnexpectedInfOrNan, "Gradient contribution resulted in a NaN value" current_grad = np.dot(gradnew, gradnew) gradold = gradnew.copy() d = -gradnew # Initial search direction. @@ -168,13 +168,13 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True, if Delta < 0.25: beta = min(4.0 * beta, betamax) if Delta > 0.75: - beta = max(0.5 * beta, betamin) + beta = max(0.25 * beta, betamin) # Update search direction using Polak-Ribiere formula, or re-start # in direction of negative gradient after nparams steps. if nsuccess == x.size: d = -gradnew -# beta = 1. 
# TODO: betareset!! + beta = 1. # This is not in the original paper nsuccess = 0 elif success: Gamma = np.dot(gradold - gradnew, gradnew) / (mu) diff --git a/GPy/installation.cfg b/GPy/installation.cfg new file mode 100644 index 00000000..901a7ef5 --- /dev/null +++ b/GPy/installation.cfg @@ -0,0 +1,2 @@ +# This is the local installation configuration file for GPy + diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index 1ed5e805..39529843 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -3,16 +3,20 @@ from _src.rbf import RBF from _src.linear import Linear, LinearFull from _src.static import Bias, White from _src.brownian import Brownian -from _src.stationary import Exponential, Matern32, Matern52, ExpQuad, RatQuad, Cosine +from _src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine from _src.mlp import MLP from _src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52 from _src.independent_outputs import IndependentOutputs, Hierarchical from _src.coregionalize import Coregionalize -from _src.ssrbf import SSRBF # TODO: ZD: did you remove this? from _src.ODE_UY import ODE_UY +from _src.ODE_UYC import ODE_UYC +from _src.ODE_st import ODE_st +from _src.ODE_t import ODE_t from _src.poly import Poly -#from _src.ODE_UYC import ODE_UYC ADD THIS FILE TO THE REPO!! -#from _src.ODE_st import ODE_st + +from _src.trunclinear import TruncLinear,TruncLinear_inf +from _src.splitKern import SplitKern,DiffGenomeKern + # TODO: put this in an init file somewhere #I'm commenting this out because the files were not added. JH. Remember to add the files before commiting try: diff --git a/GPy/kern/_src/ODE_UYC.py b/GPy/kern/_src/ODE_UYC.py new file mode 100644 index 00000000..1722d2e1 --- /dev/null +++ b/GPy/kern/_src/ODE_UYC.py @@ -0,0 +1,290 @@ +# Copyright (c) 2013, GPy authors (see AUTHORS.txt). 
# Licensed under the BSD 3-clause license (see LICENSE.txt)

from kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
from independent_outputs import index_to_slices


class ODE_UYC(Kern):
    """
    Joint covariance of a driving function U and the response Y of the
    first order ODE

        a * dy/dt + b * y = U

    U carries a Matern-3/2 covariance plus a constant bias term.

    Inputs have two columns: the first is the (time) input, the last is an
    output index that is handed to ``index_to_slices``. Index 0 selects rows
    belonging to U, index 1 selects rows belonging to Y.

    :param input_dim: number of input columns (including the index column), has to be 2
    :type input_dim: int
    :param variance_U: variance of the Matern-3/2 part of the U covariance
    :param variance_Y: scaling of the Y covariance (multiplied with variance_U)
    :param lengthscale_U: lengthscale of U; the code works with lu = sqrt(3)/lengthscale_U
    :param lengthscale_Y: lengthscale of Y; the code works with ly = 1/lengthscale_Y
    :param ubias: constant added to the U-U covariance block
    :param active_dims: forwarded unchanged to ``Kern.__init__``
    :param name: name of the kernel, forwarded to ``Kern.__init__``

    All five kernel parameters are registered with a ``Logexp`` transformation.
    """

    def __init__(self, input_dim, variance_U=3., variance_Y=1., lengthscale_U=1., lengthscale_Y=1., ubias=1., active_dims=None, name='ode_uyc'):
        assert input_dim == 2, "only defined for 2 input dims"
        super(ODE_UYC, self).__init__(input_dim, active_dims, name)

        self.variance_Y = Param('variance_Y', variance_Y, Logexp())
        self.variance_U = Param('variance_U', variance_U, Logexp())
        self.lengthscale_Y = Param('lengthscale_Y', lengthscale_Y, Logexp())
        self.lengthscale_U = Param('lengthscale_U', lengthscale_U, Logexp())
        self.ubias = Param('ubias', ubias, Logexp())

        self.add_parameters(self.variance_Y, self.variance_U, self.lengthscale_Y, self.lengthscale_U, self.ubias)

    def K(self, X, X2=None):
        """
        Compute the covariance matrix between X and X2.

        :param X: inputs, last column is the output index (0 = U, 1 = Y)
        :param X2: second set of inputs in the same layout; X is used when None
        :returns: array of shape (X.shape[0], X2.shape[0])
        :raises ValueError: if a block with an output index other than 0 or 1 is met
        """
        # model :   a * dy/dt + b * y = U
        # lu=sqrt(3)/theta1  ly=1/theta2  theta2= a/b :thetay   sigma2=1/(2ab) :sigmay
        X, slices = X[:, :-1], index_to_slices(X[:, -1])
        if X2 is None:
            X2, slices2 = X, slices
        else:
            X2, slices2 = X2[:, :-1], index_to_slices(X2[:, -1])
        K = np.zeros((X.shape[0], X2.shape[0]))

        # signed pairwise differences; relies on X having a single non-index column
        rdist = X - X2.T
        ly = 1/self.lengthscale_Y
        lu = np.sqrt(3)/self.lengthscale_U
        Vu = self.variance_U
        Vy = self.variance_Y

        # kernel for kuu: matern3/2 plus bias
        kuu = lambda dist: Vu * (1 + lu* np.abs(dist)) * np.exp(-lu * np.abs(dist)) + self.ubias

        # kernel for kyy
        k1 = lambda dist: np.exp(-ly*np.abs(dist))*(2*lu+ly)/(lu+ly)**2
        k2 = lambda dist: (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
        k3 = lambda dist: np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
        kyy = lambda dist: Vu*Vy*(k1(dist) + k2(dist) + k3(dist))

        # cross covariance function
        kyu3 = lambda dist: np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
        k1cros = lambda dist: np.exp(ly*dist)/(lu-ly) * ( 1- np.exp( (lu-ly)*dist) + lu* ( dist*np.exp( (lu-ly)*dist ) + (1- np.exp( (lu-ly)*dist ) ) /(lu-ly) ) )
        k2cros = lambda dist: np.exp(ly*dist)*( 1/(lu+ly) + lu/(lu+ly)**2 )

        Vyu = np.sqrt(Vy*ly*2)

        # cross covariance kuy
        kuyp = lambda dist: Vu*Vyu*(kyu3(dist))                      # t>0 kuy
        kuyn = lambda dist: Vu*Vyu*(k1cros(dist)+k2cros(dist))       # t<0 kuy
        # cross covariance kyu
        kyup = lambda dist: Vu*Vyu*(k1cros(-dist)+k2cros(-dist))     # t>0 kyu
        kyun = lambda dist: Vu*Vyu*(kyu3(-dist))                     # t<0 kyu

        for i, s1 in enumerate(slices):
            for j, s2 in enumerate(slices2):
                for ss1 in s1:
                    for ss2 in s2:
                        d = rdist[ss1, ss2]
                        if i == 0 and j == 0:
                            K[ss1, ss2] = kuu(np.abs(d))
                        elif i == 0 and j == 1:
                            K[ss1, ss2] = np.where(d > 0, kuyp(d), kuyn(d))
                        elif i == 1 and j == 1:
                            K[ss1, ss2] = kyy(np.abs(d))
                        elif i == 1 and j == 0:
                            K[ss1, ss2] = np.where(d > 0, kyup(d), kyun(d))
                        else:
                            # only two outputs exist; same policy as Kdiag
                            raise ValueError("invalid input/output index")
        return K

    def Kdiag(self, X):
        """
        Compute the diagonal of the covariance matrix associated to X.

        :param X: inputs, last column is the output index (0 = U, 1 = Y)
        :returns: array of length X.shape[0]
        :raises ValueError: if an output index other than 0 or 1 is met
        """
        Kdiag = np.zeros(X.shape[0])
        ly = 1/self.lengthscale_Y
        lu = np.sqrt(3)/self.lengthscale_U

        Vu = self.variance_U
        Vy = self.variance_Y

        # the three kyy terms evaluated at distance zero
        k1 = (2*lu+ly)/(lu+ly)**2
        k2 = (ly-2*lu + 2*lu-ly ) / (ly-lu)**2
        k3 = 1/(lu+ly) + (lu)/(lu+ly)**2

        slices = index_to_slices(X[:, -1])

        for i, ss1 in enumerate(slices):
            for s1 in ss1:
                if i == 0:
                    Kdiag[s1] += self.variance_U + self.ubias
                elif i == 1:
                    Kdiag[s1] += Vu*Vy*(k1+k2+k3)
                else:
                    raise ValueError("invalid input/output index")
        return Kdiag

    def update_gradients_full(self, dL_dK, X, X2=None):
        """
        Set the ``gradient`` of every kernel parameter from dL_dK.

        :param dL_dK: derivative of the objective with respect to K(X, X2)
        :param X: inputs, last column is the output index (0 = U, 1 = Y)
        :param X2: second set of inputs in the same layout; X is used when None
        :raises ValueError: if a block with an output index other than 0 or 1 is met
        """
        X, slices = X[:, :-1], index_to_slices(X[:, -1])
        if X2 is None:
            X2, slices2 = X, slices
        else:
            X2, slices2 = X2[:, :-1], index_to_slices(X2[:, -1])

        rdist = X - X2.T
        ly = 1/self.lengthscale_Y
        lu = np.sqrt(3)/self.lengthscale_U

        Vu = self.variance_U
        Vy = self.variance_Y
        Vyu = np.sqrt(Vy*ly*2)
        dVdly = 0.5/np.sqrt(ly)*np.sqrt(2*Vy)
        dVdVy = 0.5/np.sqrt(Vy)*np.sqrt(2*ly)

        # Derivative buffers must have the shape of K(X, X2); a square
        # (N, N) buffer is wrong as soon as X2 has a different row count.
        dktheta1 = np.zeros(rdist.shape)   # d K / d lu
        dktheta2 = np.zeros(rdist.shape)   # d K / d ly
        dkUdvar = np.zeros(rdist.shape)    # d K / d variance_U
        dkYdvar = np.zeros(rdist.shape)    # d K / d variance_Y
        dkdubias = np.zeros(rdist.shape)   # d K / d ubias

        # dk dtheta for UU
        UUdtheta1 = lambda dist: np.exp(-lu* dist)*dist + (-dist)*np.exp(-lu* dist)*(1+lu*dist)
        UUdvar = lambda dist: (1 + lu* np.abs(dist)) * np.exp(-lu * np.abs(dist))

        # dk dtheta for YY
        dk1theta1 = lambda dist: np.exp(-ly*dist)*2*(-lu)/(lu+ly)**3

        dk2theta1 = lambda dist: (1.0)*(
            np.exp(-lu*dist)*dist*(-ly+2*lu-lu*ly*dist+dist*lu**2)*(ly-lu)**(-2) + np.exp(-lu*dist)*(-2+ly*dist-2*dist*lu)*(ly-lu)**(-2)
            +np.exp(-dist*lu)*(ly-2*lu+ly*lu*dist-dist*lu**2)*2*(ly-lu)**(-3)
            +np.exp(-dist*ly)*2*(ly-lu)**(-2)
            +np.exp(-dist*ly)*2*(2*lu-ly)*(ly-lu)**(-3)
            )

        dk3theta1 = lambda dist: np.exp(-dist*lu)*(lu+ly)**(-2)*((2*lu+ly+dist*lu**2+lu*ly*dist)*(-dist-2/(lu+ly))+2+2*lu*dist+ly*dist)

        dk1theta2 = lambda dist: np.exp(-ly*dist) * ((lu+ly)**(-2)) * ( (-dist)*(2*lu+ly) + 1 + (-2)*(2*lu+ly)/(lu+ly) )

        dk2theta2 = lambda dist: 1*(
            np.exp(-dist*lu)*(ly-lu)**(-2) * ( 1+lu*dist+(-2)*(ly-2*lu+lu*ly*dist-dist*lu**2)*(ly-lu)**(-1) )
            +np.exp(-dist*ly)*(ly-lu)**(-2) * ( (-dist)*(2*lu-ly) -1+(2*lu-ly)*(-2)*(ly-lu)**(-1) )
            )

        dk3theta2 = lambda dist: np.exp(-dist*lu) * (-3*lu-ly-dist*lu**2-lu*ly*dist)/(lu+ly)**3

        # kyy kernel (without the Vu*Vy scaling)
        k1 = lambda dist: np.exp(-ly*dist)*(2*lu+ly)/(lu+ly)**2
        k2 = lambda dist: (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
        k3 = lambda dist: np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )

        # cross covariance function (without the Vu*Vyu scaling)
        kyu3 = lambda dist: np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
        k1cros = lambda dist: np.exp(ly*dist)/(lu-ly) * ( 1- np.exp( (lu-ly)*dist) + lu* ( dist*np.exp( (lu-ly)*dist ) + (1- np.exp( (lu-ly)*dist ) ) /(lu-ly) ) )
        k2cros = lambda dist: np.exp(ly*dist)*( 1/(lu+ly) + lu/(lu+ly)**2 )
        # cross covariance kuy
        kuyp = lambda dist: (kyu3(dist))                      # t>0 kuy
        kuyn = lambda dist: (k1cros(dist)+k2cros(dist))       # t<0 kuy
        # cross covariance kyu
        kyup = lambda dist: (k1cros(-dist)+k2cros(-dist))     # t>0 kyu
        kyun = lambda dist: (kyu3(-dist))                     # t<0 kyu

        # dk dtheta for UY
        dkyu3dtheta2 = lambda dist: np.exp(-lu*dist) * ( (-1)*(lu+ly)**(-2)*(1+lu*dist+lu*(lu+ly)**(-1)) + (lu+ly)**(-1)*(-lu)*(lu+ly)**(-2) )
        dkyu3dtheta1 = lambda dist: np.exp(-lu*dist)*(lu+ly)**(-1)* ( (-dist)*(1+dist*lu+lu*(lu+ly)**(-1)) -\
            (lu+ly)**(-1)*(1+dist*lu+lu*(lu+ly)**(-1)) +dist+(lu+ly)**(-1)-lu*(lu+ly)**(-2) )

        dkcros2dtheta1 = lambda dist: np.exp(ly*dist)* ( -(ly+lu)**(-2) + (ly+lu)**(-2) + (-2)*lu*(lu+ly)**(-3) )
        dkcros2dtheta2 = lambda dist: np.exp(ly*dist)*dist* ( (ly+lu)**(-1) + lu*(lu+ly)**(-2) ) + \
            np.exp(ly*dist)*( -(lu+ly)**(-2) + lu*(-2)*(lu+ly)**(-3) )

        dkcros1dtheta1 = lambda dist: np.exp(ly*dist)*( -(lu-ly)**(-2)*( 1-np.exp((lu-ly)*dist) + lu*dist*np.exp((lu-ly)*dist)+ \
            lu*(1-np.exp((lu-ly)*dist))/(lu-ly) ) + (lu-ly)**(-1)*( -np.exp( (lu-ly)*dist )*dist + dist*np.exp( (lu-ly)*dist)+\
            lu*dist**2*np.exp((lu-ly)*dist)+(1-np.exp((lu-ly)*dist))/(lu-ly) - lu*np.exp((lu-ly)*dist)*dist/(lu-ly) -\
            lu*(1-np.exp((lu-ly)*dist))/(lu-ly)**2 ) )

        dkcros1dtheta2 = lambda t: np.exp(ly*t)*t/(lu-ly)*( 1-np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t)+\
            lu*(1-np.exp((lu-ly)*t))/(lu-ly) )+\
            np.exp(ly*t)/(lu-ly)**2* ( 1-np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t) + lu*( 1-np.exp((lu-ly)*t) )/(lu-ly) )+\
            np.exp(ly*t)/(lu-ly)*( np.exp((lu-ly)*t)*t -lu*t*t*np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t)/(lu-ly)+\
            lu*( 1-np.exp((lu-ly)*t) )/(lu-ly)**2 )

        dkuypdtheta1 = lambda dist: (dkyu3dtheta1(dist))                              # t>0 kuy
        dkuyndtheta1 = lambda dist: (dkcros1dtheta1(dist)+dkcros2dtheta1(dist))       # t<0 kuy
        dkyupdtheta1 = lambda dist: (dkcros1dtheta1(-dist)+dkcros2dtheta1(-dist))     # t>0 kyu
        dkyundtheta1 = lambda dist: (dkyu3dtheta1(-dist))                             # t<0 kyu

        dkuypdtheta2 = lambda dist: (dkyu3dtheta2(dist))                              # t>0 kuy
        dkuyndtheta2 = lambda dist: (dkcros1dtheta2(dist)+dkcros2dtheta2(dist))       # t<0 kuy
        dkyupdtheta2 = lambda dist: (dkcros1dtheta2(-dist)+dkcros2dtheta2(-dist))     # t>0 kyu
        dkyundtheta2 = lambda dist: (dkyu3dtheta2(-dist))                             # t<0 kyu

        for i, s1 in enumerate(slices):
            for j, s2 in enumerate(slices2):
                for ss1 in s1:
                    for ss2 in s2:
                        d = rdist[ss1, ss2]
                        ad = np.abs(d)
                        pos = d > 0
                        if i == 0 and j == 0:
                            # U-U block: depends on lu, variance_U and ubias only
                            dktheta1[ss1, ss2] = Vu*UUdtheta1(ad)
                            dktheta2[ss1, ss2] = 0
                            dkUdvar[ss1, ss2] = UUdvar(ad)
                            dkYdvar[ss1, ss2] = 0
                            dkdubias[ss1, ss2] = 1
                        elif i == 0 and j == 1:
                            dktheta1[ss1, ss2] = np.where(pos, Vu*Vyu*dkuypdtheta1(d), Vu*Vyu*dkuyndtheta1(d))
                            dkUdvar[ss1, ss2] = np.where(pos, Vyu*kuyp(d), Vyu*kuyn(d))
                            dktheta2[ss1, ss2] = np.where(pos, Vu*Vyu*dkuypdtheta2(d)+Vu*dVdly*kuyp(d), Vu*Vyu*dkuyndtheta2(d)+Vu*dVdly*kuyn(d))
                            dkYdvar[ss1, ss2] = np.where(pos, Vu*dVdVy*kuyp(d), Vu*dVdVy*kuyn(d))
                            dkdubias[ss1, ss2] = 0
                        elif i == 1 and j == 1:
                            dktheta1[ss1, ss2] = self.variance_U*self.variance_Y*(dk1theta1(ad)+dk2theta1(ad)+dk3theta1(ad))
                            dktheta2[ss1, ss2] = self.variance_U*self.variance_Y*(dk1theta2(ad) + dk2theta2(ad) + dk3theta2(ad))
                            dkUdvar[ss1, ss2] = self.variance_Y*(k1(ad)+k2(ad)+k3(ad))
                            dkYdvar[ss1, ss2] = self.variance_U*(k1(ad)+k2(ad)+k3(ad))
                            dkdubias[ss1, ss2] = 0
                        elif i == 1 and j == 0:
                            dktheta1[ss1, ss2] = np.where(pos, Vu*Vyu*dkyupdtheta1(d), Vu*Vyu*dkyundtheta1(d))
                            dkUdvar[ss1, ss2] = np.where(pos, Vyu*kyup(d), Vyu*kyun(d))
                            dktheta2[ss1, ss2] = np.where(pos, Vu*Vyu*dkyupdtheta2(d)+Vu*dVdly*kyup(d), Vu*Vyu*dkyundtheta2(d)+Vu*dVdly*kyun(d))
                            dkYdvar[ss1, ss2] = np.where(pos, Vu*dVdVy*kyup(d), Vu*dVdVy*kyun(d))
                            dkdubias[ss1, ss2] = 0
                        else:
                            # only two outputs exist; same policy as Kdiag
                            raise ValueError("invalid input/output index")

        self.variance_U.gradient = np.sum(dkUdvar * dL_dK)   # Vu

        self.variance_Y.gradient = np.sum(dkYdvar * dL_dK)   # Vy

        # chain rule through lu = sqrt(3)/lengthscale_U and ly = 1/lengthscale_Y
        self.lengthscale_U.gradient = np.sum(dktheta1*(-np.sqrt(3)*self.lengthscale_U**(-2))* dL_dK)   # lu

        self.lengthscale_Y.gradient = np.sum(dktheta2*(-self.lengthscale_Y**(-2)) * dL_dK)   # ly

        self.ubias.gradient = np.sum(dkdubias * dL_dK)


# (patch boundary, kept verbatim) diff --git a/GPy/kern/_src/ODE_st.py b/GPy/kern/_src/ODE_st.py new file mode 100644 index 00000000..665be230 --- /dev/null +++ b/GPy/kern/_src/ODE_st.py @@ -0,0 +1,267 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
from independent_outputs import index_to_slices


class ODE_st(Kern):
    """
    Spatio-temporal kernel for the pair (Y, U) linked by the model

        -a * d^2y/dx^2 + b * dy/dt + c * y = U

    Y has the covariance variance_Yt * variance_Yx * exp(-lyt*dt**2 - lyx*dx**2)
    with lyt = 1/(2*lengthscale_Yt) and lyx = 1/(2*lengthscale_Yx); the
    remaining blocks are built from derivatives of that covariance.

    Inputs have three columns: time, space and an output index that is handed
    to ``index_to_slices``. Index 0 selects rows belonging to Y, index 1
    selects rows belonging to U.

    :param input_dim: number of input columns (including the index column), has to be 3
    :type input_dim: int
    :param a: coefficient of the second spatial derivative (used as -a in the model)
    :type a: float
    :param b: coefficient of the time derivative
    :type b: float
    :param c: coefficient of y
    :type c: float
    :param variance_Yx: spatial variance of Y
    :type variance_Yx: float
    :param variance_Yt: temporal variance of Y
    :type variance_Yt: float
    :param lengthscale_Yx: spatial lengthscale of Y
    :type lengthscale_Yx: float
    :param lengthscale_Yt: temporal lengthscale of Y
    :type lengthscale_Yt: float
    :rtype: kernel object

    """

    def __init__(self, input_dim, a=1., b=1., c=1., variance_Yx=3., variance_Yt=1.5, lengthscale_Yx=1.5, lengthscale_Yt=1.5, active_dims=None, name='ode_st'):
        assert input_dim == 3, "only defined for 3 input dims"
        super(ODE_st, self).__init__(input_dim, active_dims, name)

        self.variance_Yt = Param('variance_Yt', variance_Yt, Logexp())
        self.variance_Yx = Param('variance_Yx', variance_Yx, Logexp())
        self.lengthscale_Yt = Param('lengthscale_Yt', lengthscale_Yt, Logexp())
        self.lengthscale_Yx = Param('lengthscale_Yx', lengthscale_Yx, Logexp())

        self.a = Param('a', a, Logexp())
        self.b = Param('b', b, Logexp())
        self.c = Param('c', c, Logexp())

        self.add_parameters(self.a, self.b, self.c, self.variance_Yt, self.variance_Yx, self.lengthscale_Yt, self.lengthscale_Yx)

    def K(self, X, X2=None):
        """
        Compute the covariance matrix between X and X2.

        :param X: inputs with columns (time, space, output index)
        :param X2: second set of inputs in the same layout; X is used when None
        :returns: array of shape (X.shape[0], X2.shape[0])
        :raises ValueError: if a block with an output index other than 0 or 1 is met
        """
        # model :   -a d^2y/dx^2  + b dy/dt + c * y = U
        # kernel Kyy: rbf, spatio-temporal
        # vyt Y temporal variance   vyx Y spatial variance
        # lyt Y temporal lengthscale   lyx Y spatial lengthscale
        # kernel Kuu: differential operator applied twice to Kyy
        X, slices = X[:, :-1], index_to_slices(X[:, -1])
        if X2 is None:
            X2, slices2 = X, slices
        else:
            X2, slices2 = X2[:, :-1], index_to_slices(X2[:, -1])
        K = np.zeros((X.shape[0], X2.shape[0]))

        tdist = (X[:, 0][:, None] - X2[:, 0][None, :])**2    # squared time differences
        xdist = (X[:, 1][:, None] - X2[:, 1][None, :])**2    # squared space differences
        ttdist = (X[:, 0][:, None] - X2[:, 0][None, :])      # signed time differences

        vyt = self.variance_Yt
        vyx = self.variance_Yx

        lyt = 1/(2*self.lengthscale_Yt)
        lyx = 1/(2*self.lengthscale_Yx)

        a = self.a   # -a is used in the model, negative diffusion
        b = self.b
        c = self.c

        kyy = lambda tdist, xdist: np.exp(-lyt*(tdist) - lyx*(xdist))

        k1 = lambda tdist: (2*lyt - 4*lyt**2 * (tdist) )

        k2 = lambda xdist: ( 4*lyx**2 * (xdist) - 2*lyx )

        k3 = lambda xdist: ( 3*4*lyx**2 - 6*8*xdist*lyx**3 + 16*xdist**2*lyx**4 )

        k4 = lambda ttdist: 2*lyt*(ttdist)

        for i, s1 in enumerate(slices):
            for j, s2 in enumerate(slices2):
                for ss1 in s1:
                    for ss2 in s2:
                        t2 = tdist[ss1, ss2]
                        x2 = xdist[ss1, ss2]
                        tt = ttdist[ss1, ss2]
                        if i == 0 and j == 0:
                            K[ss1, ss2] = vyt*vyx*kyy(t2, x2)
                        elif i == 0 and j == 1:
                            K[ss1, ss2] = (-a*k2(x2) + b*k4(tt) + c)*vyt*vyx*kyy(t2, x2)
                        elif i == 1 and j == 1:
                            K[ss1, ss2] = ( b**2*k1(t2) - 2*a*c*k2(x2) + a**2*k3(x2) + c**2 )* vyt*vyx* kyy(t2, x2)
                        elif i == 1 and j == 0:
                            K[ss1, ss2] = (-a*k2(x2) - b*k4(tt) + c)*vyt*vyx*kyy(t2, x2)
                        else:
                            # only two outputs exist; same policy as Kdiag
                            raise ValueError("invalid input/output index")
        return K

    def Kdiag(self, X):
        """
        Compute the diagonal of the covariance matrix associated to X.

        :param X: inputs with columns (time, space, output index)
        :returns: array of length X.shape[0]
        :raises ValueError: if an output index other than 0 or 1 is met
        """
        vyt = self.variance_Yt
        vyx = self.variance_Yx

        lyt = 1./(2*self.lengthscale_Yt)
        lyx = 1./(2*self.lengthscale_Yx)

        a = self.a
        b = self.b
        c = self.c

        ## dk^2/dtdt'
        k1 = (2*lyt )*vyt*vyx
        ## dk^2/dx^2
        k2 = ( - 2*lyx )*vyt*vyx
        ## dk^4/dx^2dx'^2
        k3 = ( 4*3*lyx**2 )*vyt*vyx

        Kdiag = np.zeros(X.shape[0])
        slices = index_to_slices(X[:, -1])

        for i, ss1 in enumerate(slices):
            for s1 in ss1:
                if i == 0:
                    Kdiag[s1] += vyt*vyx
                elif i == 1:
                    Kdiag[s1] += b**2*k1 - 2*a*c*k2 + a**2*k3 + c**2*vyt*vyx
                else:
                    raise ValueError("invalid input/output index")

        return Kdiag

    def update_gradients_full(self, dL_dK, X, X2=None):
        """
        Set the ``gradient`` of every kernel parameter from dL_dK.

        :param dL_dK: derivative of the objective with respect to K(X, X2)
        :param X: inputs with columns (time, space, output index)
        :param X2: second set of inputs in the same layout; X is used when None
        :raises ValueError: if a block with an output index other than 0 or 1 is met
        """
        X, slices = X[:, :-1], index_to_slices(X[:, -1])
        if X2 is None:
            X2, slices2 = X, slices
        else:
            X2, slices2 = X2[:, :-1], index_to_slices(X2[:, -1])

        vyt = self.variance_Yt
        vyx = self.variance_Yx

        lyt = 1./(2*self.lengthscale_Yt)
        lyx = 1./(2*self.lengthscale_Yx)

        a = self.a
        b = self.b
        c = self.c

        tdist = (X[:, 0][:, None] - X2[:, 0][None, :])**2
        xdist = (X[:, 1][:, None] - X2[:, 1][None, :])**2
        ttdist = (X[:, 0][:, None] - X2[:, 0][None, :])

        # Derivative buffers must have the shape of K(X, X2); a square
        # (N, N) buffer is wrong as soon as X2 has a different row count.
        dka = np.zeros(tdist.shape)
        dkb = np.zeros(tdist.shape)
        dkc = np.zeros(tdist.shape)
        dkYdvart = np.zeros(tdist.shape)
        dkYdvarx = np.zeros(tdist.shape)
        dkYdlent = np.zeros(tdist.shape)   # d K / d lyt
        dkYdlenx = np.zeros(tdist.shape)   # d K / d lyx

        kyy = lambda tdist, xdist: np.exp(-lyt*(tdist) - lyx*(xdist))

        k1 = lambda tdist: (2*lyt - 4*lyt**2 * (tdist) )

        k2 = lambda xdist: ( 4*lyx**2 * (xdist) - 2*lyx )

        k3 = lambda xdist: ( 3*4*lyx**2 - 6*8*xdist*lyx**3 + 16*xdist**2*lyx**4 )

        k4 = lambda ttdist: 2*lyt*(ttdist)

        dkyydlyx = lambda tdist, xdist: kyy(tdist, xdist)*(-xdist)
        dkyydlyt = lambda tdist, xdist: kyy(tdist, xdist)*(-tdist)

        dk1dlyt = lambda tdist: 2. - 4*2.*lyt*tdist
        dk2dlyx = lambda xdist: (4.*2.*lyx*xdist - 2.)
        dk3dlyx = lambda xdist: (6.*4.*lyx - 18.*8*xdist*lyx**2 + 4*16*xdist**2*lyx**3)

        dk4dlyt = lambda ttdist: 2*(ttdist)

        for i, s1 in enumerate(slices):
            for j, s2 in enumerate(slices2):
                for ss1 in s1:
                    for ss2 in s2:
                        t2 = tdist[ss1, ss2]
                        x2 = xdist[ss1, ss2]
                        tt = ttdist[ss1, ss2]
                        if i == 0 and j == 0:
                            # Y-Y block does not depend on a, b, c
                            dka[ss1, ss2] = 0
                            dkb[ss1, ss2] = 0
                            dkc[ss1, ss2] = 0
                            dkYdvart[ss1, ss2] = vyx*kyy(t2, x2)
                            dkYdvarx[ss1, ss2] = vyt*kyy(t2, x2)
                            dkYdlenx[ss1, ss2] = vyt*vyx*dkyydlyx(t2, x2)
                            dkYdlent[ss1, ss2] = vyt*vyx*dkyydlyt(t2, x2)
                        elif i == 0 and j == 1:
                            dka[ss1, ss2] = -k2(x2)*vyt*vyx*kyy(t2, x2)
                            dkb[ss1, ss2] = k4(tt)*vyt*vyx*kyy(t2, x2)
                            dkc[ss1, ss2] = vyt*vyx*kyy(t2, x2)
                            dkYdvart[ss1, ss2] = (-a*k2(x2)+b*k4(tt)+c)*vyx*kyy(t2, x2)
                            dkYdvarx[ss1, ss2] = (-a*k2(x2)+b*k4(tt)+c)*vyt*kyy(t2, x2)
                            dkYdlent[ss1, ss2] = vyt*vyx*dkyydlyt(t2, x2)* (-a*k2(x2)+b*k4(tt)+c)+\
                                vyt*vyx*kyy(t2, x2)*b*dk4dlyt(tt)
                            dkYdlenx[ss1, ss2] = vyt*vyx*dkyydlyx(t2, x2)*(-a*k2(x2)+b*k4(tt)+c)+\
                                vyt*vyx*kyy(t2, x2)*(-a*dk2dlyx(x2))
                        elif i == 1 and j == 1:
                            dka[ss1, ss2] = (2*a*k3(x2) - 2*c*k2(x2))*vyt*vyx* kyy(t2, x2)
                            dkb[ss1, ss2] = 2*b*k1(t2)*vyt*vyx* kyy(t2, x2)
                            dkc[ss1, ss2] = (-2*a*k2(x2) + 2*c )*vyt*vyx* kyy(t2, x2)
                            dkYdvart[ss1, ss2] = ( b**2*k1(t2) - 2*a*c*k2(x2) + a**2*k3(x2) + c**2 )*vyx* kyy(t2, x2)
                            dkYdvarx[ss1, ss2] = ( b**2*k1(t2) - 2*a*c*k2(x2) + a**2*k3(x2) + c**2 )*vyt* kyy(t2, x2)
                            dkYdlent[ss1, ss2] = vyt*vyx*dkyydlyt(t2, x2)*( b**2*k1(t2) - 2*a*c*k2(x2) + a**2*k3(x2) + c**2 ) +\
                                vyx*vyt*kyy(t2, x2)*b**2*dk1dlyt(t2)
                            dkYdlenx[ss1, ss2] = vyt*vyx*dkyydlyx(t2, x2)*( b**2*k1(t2) - 2*a*c*k2(x2) + a**2*k3(x2) + c**2 ) +\
                                vyx*vyt*kyy(t2, x2)* (-2*a*c*dk2dlyx(x2) + a**2*dk3dlyx(x2) )
                        elif i == 1 and j == 0:
                            dka[ss1, ss2] = -k2(x2)*vyt*vyx*kyy(t2, x2)
                            dkb[ss1, ss2] = -k4(tt)*vyt*vyx*kyy(t2, x2)
                            dkc[ss1, ss2] = vyt*vyx*kyy(t2, x2)
                            dkYdvart[ss1, ss2] = (-a*k2(x2)-b*k4(tt)+c)*vyx*kyy(t2, x2)
                            dkYdvarx[ss1, ss2] = (-a*k2(x2)-b*k4(tt)+c)*vyt*kyy(t2, x2)
                            dkYdlent[ss1, ss2] = vyt*vyx*dkyydlyt(t2, x2)* (-a*k2(x2)-b*k4(tt)+c)+\
                                vyt*vyx*kyy(t2, x2)*(-1)*b*dk4dlyt(tt)
                            dkYdlenx[ss1, ss2] = vyt*vyx*dkyydlyx(t2, x2)*(-a*k2(x2)-b*k4(tt)+c)+\
                                vyt*vyx*kyy(t2, x2)*(-a*dk2dlyx(x2))
                        else:
                            # only two outputs exist; same policy as Kdiag
                            raise ValueError("invalid input/output index")

        self.a.gradient = np.sum(dka * dL_dK)

        self.b.gradient = np.sum(dkb * dL_dK)

        self.c.gradient = np.sum(dkc * dL_dK)

        self.variance_Yt.gradient = np.sum(dkYdvart * dL_dK)

        self.variance_Yx.gradient = np.sum(dkYdvarx * dL_dK)

        # chain rule through lyt = 1/(2*lengthscale_Yt) and lyx = 1/(2*lengthscale_Yx)
        self.lengthscale_Yt.gradient = np.sum(dkYdlent*(-0.5*self.lengthscale_Yt**(-2)) * dL_dK)

        self.lengthscale_Yx.gradient = np.sum(dkYdlenx*(-0.5*self.lengthscale_Yx**(-2)) * dL_dK)


# (patch boundary, kept verbatim) diff --git a/GPy/kern/_src/ODE_t.py b/GPy/kern/_src/ODE_t.py new file mode 100644 index 00000000..a470cbec --- /dev/null +++ b/GPy/kern/_src/ODE_t.py @@ -0,0 +1,165 @@
from kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
from independent_outputs import index_to_slices


class ODE_t(Kern):
    """
    Temporal kernel for two outputs built from a squared-exponential
    covariance vyt * exp(-lyt*dt**2), with lyt = 1/(2*lengthscale_Yt), and
    its time derivatives.

    Inputs have two columns: time and an output index that is handed to
    ``index_to_slices``. Index 0 selects the block that uses the plain
    squared-exponential covariance; index 1 selects the block that uses the
    differentiated covariance plus the constant ``ubias``.

    :param input_dim: number of input columns (including the index column), has to be 2
    :type input_dim: int
    :param a: registered as a parameter, but the covariance does not depend on it
    :param c: registered as a parameter, but the covariance does not depend on it
    :param variance_Yt: variance of the squared-exponential covariance
    :param lengthscale_Yt: lengthscale of the squared-exponential covariance
    :param ubias: constant added to the index-1 / index-1 block
    :param active_dims: forwarded unchanged to ``Kern.__init__``
    :param name: name of the kernel, forwarded to ``Kern.__init__``
    """

    # NOTE(review): the default name is 'ode_st', identical to ODE_st's default;
    # it looks like a copy-paste leftover but is kept because renaming would
    # change the parameter names callers see.
    def __init__(self, input_dim, a=1., c=1., variance_Yt=3., lengthscale_Yt=1.5, ubias=1., active_dims=None, name='ode_st'):
        assert input_dim == 2, "only defined for 2 input dims"
        super(ODE_t, self).__init__(input_dim, active_dims, name)

        self.variance_Yt = Param('variance_Yt', variance_Yt, Logexp())
        self.lengthscale_Yt = Param('lengthscale_Yt', lengthscale_Yt, Logexp())

        self.a = Param('a', a, Logexp())
        self.c = Param('c', c, Logexp())
        self.ubias = Param('ubias', ubias, Logexp())
        self.add_parameters(self.a, self.c, self.variance_Yt, self.lengthscale_Yt, self.ubias)

    def K(self, X, X2=None):
        """
        Compute the covariance matrix between X and X2.

        :param X: inputs with columns (time, output index)
        :param X2: second set of inputs in the same layout; X is used when None
        :returns: array of shape (X.shape[0], X2.shape[0])
        :raises ValueError: if a block with an output index other than 0 or 1 is met
        """
        X, slices = X[:, :-1], index_to_slices(X[:, -1])
        if X2 is None:
            X2, slices2 = X, slices
        else:
            X2, slices2 = X2[:, :-1], index_to_slices(X2[:, -1])
        K = np.zeros((X.shape[0], X2.shape[0]))

        tdist = (X[:, 0][:, None] - X2[:, 0][None, :])**2    # squared time differences
        ttdist = (X[:, 0][:, None] - X2[:, 0][None, :])      # signed time differences

        vyt = self.variance_Yt

        lyt = 1/(2*self.lengthscale_Yt)

        kyy = lambda tdist: np.exp(-lyt*(tdist))

        k1 = lambda tdist: (2*lyt - 4*lyt**2 *(tdist) )

        k4 = lambda ttdist: 2*lyt*(ttdist)

        for i, s1 in enumerate(slices):
            for j, s2 in enumerate(slices2):
                for ss1 in s1:
                    for ss2 in s2:
                        t2 = tdist[ss1, ss2]
                        tt = ttdist[ss1, ss2]
                        if i == 0 and j == 0:
                            K[ss1, ss2] = vyt*kyy(t2)
                        elif i == 0 and j == 1:
                            K[ss1, ss2] = (k4(tt)+1)*vyt*kyy(t2)
                        elif i == 1 and j == 1:
                            K[ss1, ss2] = ( k1(t2) + 1. )*vyt* kyy(t2)+self.ubias
                        elif i == 1 and j == 0:
                            K[ss1, ss2] = (-k4(tt)+1)*vyt*kyy(t2)
                        else:
                            # only two outputs exist; same policy as Kdiag
                            raise ValueError("invalid input/output index")
        return K

    def Kdiag(self, X):
        """
        Compute the diagonal of the covariance matrix associated to X.

        :param X: inputs with columns (time, output index)
        :returns: array of length X.shape[0]
        :raises ValueError: if an output index other than 0 or 1 is met
        """
        vyt = self.variance_Yt
        lyt = 1./(2*self.lengthscale_Yt)

        # k1 of K evaluated at distance zero, already scaled by vyt
        k1 = (2*lyt )*vyt

        Kdiag = np.zeros(X.shape[0])
        slices = index_to_slices(X[:, -1])

        for i, ss1 in enumerate(slices):
            for s1 in ss1:
                if i == 0:
                    Kdiag[s1] += vyt
                elif i == 1:
                    Kdiag[s1] += k1 + vyt+self.ubias
                else:
                    raise ValueError("invalid input/output index")

        return Kdiag

    def update_gradients_full(self, dL_dK, X, X2=None):
        """
        Set the ``gradient`` of every kernel parameter from dL_dK.

        :param dL_dK: derivative of the objective with respect to K(X, X2)
        :param X: inputs with columns (time, output index)
        :param X2: second set of inputs in the same layout; X is used when None
        :raises ValueError: if a block with an output index other than 0 or 1 is met
        """
        X, slices = X[:, :-1], index_to_slices(X[:, -1])
        if X2 is None:
            X2, slices2 = X, slices
        else:
            X2, slices2 = X2[:, :-1], index_to_slices(X2[:, -1])

        vyt = self.variance_Yt

        lyt = 1./(2*self.lengthscale_Yt)

        tdist = (X[:, 0][:, None] - X2[:, 0][None, :])**2
        ttdist = (X[:, 0][:, None] - X2[:, 0][None, :])

        # Derivative buffers must have the shape of K(X, X2); a square
        # (N, N) buffer is wrong as soon as X2 has a different row count.
        dkYdvart = np.zeros(tdist.shape)
        dkYdlent = np.zeros(tdist.shape)   # d K / d lyt
        dkdubias = np.zeros(tdist.shape)

        kyy = lambda tdist: np.exp(-lyt*(tdist))
        dkyydlyt = lambda tdist: kyy(tdist)*(-tdist)

        k1 = lambda tdist: (2*lyt - 4*lyt**2 * (tdist) )

        k4 = lambda ttdist: 2*lyt*(ttdist)

        dk1dlyt = lambda tdist: 2. - 4*2.*lyt*tdist

        dk4dlyt = lambda ttdist: 2*(ttdist)

        for i, s1 in enumerate(slices):
            for j, s2 in enumerate(slices2):
                for ss1 in s1:
                    for ss2 in s2:
                        t2 = tdist[ss1, ss2]
                        tt = ttdist[ss1, ss2]
                        if i == 0 and j == 0:
                            dkYdvart[ss1, ss2] = kyy(t2)
                            dkYdlent[ss1, ss2] = vyt*dkyydlyt(t2)
                            dkdubias[ss1, ss2] = 0
                        elif i == 0 and j == 1:
                            dkYdvart[ss1, ss2] = (k4(tt)+1)*kyy(t2)
                            dkYdlent[ss1, ss2] = vyt*dkyydlyt(t2)* (k4(tt)+1.)+\
                                vyt*kyy(t2)*(dk4dlyt(tt))
                            dkdubias[ss1, ss2] = 0
                        elif i == 1 and j == 1:
                            dkYdvart[ss1, ss2] = (k1(t2) + 1. )* kyy(t2)
                            dkYdlent[ss1, ss2] = vyt*dkyydlyt(t2)*( k1(t2) + 1. ) +\
                                vyt*kyy(t2)*dk1dlyt(t2)
                            dkdubias[ss1, ss2] = 1
                        elif i == 1 and j == 0:
                            dkYdvart[ss1, ss2] = (-k4(tt)+1)*kyy(t2)
                            dkYdlent[ss1, ss2] = vyt*dkyydlyt(t2)* (-k4(tt)+1.)+\
                                vyt*kyy(t2)*(-dk4dlyt(tt) )
                            dkdubias[ss1, ss2] = 0
                        else:
                            # only two outputs exist; same policy as Kdiag
                            raise ValueError("invalid input/output index")

        # K does not depend on a or c, so their gradients are exactly zero;
        # set them explicitly instead of leaving whatever value was there.
        self.a.gradient = 0.
        self.c.gradient = 0.

        self.variance_Yt.gradient = np.sum(dkYdvart * dL_dK)

        # chain rule through lyt = 1/(2*lengthscale_Yt)
        self.lengthscale_Yt.gradient = np.sum(dkYdlent*(-0.5*self.lengthscale_Yt**(-2)) * dL_dK)

        self.ubias.gradient = np.sum(dkdubias * dL_dK)


# (patch boundary, kept verbatim) \ No newline at end of file diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py index 88b8e40c..27f8ebd1 100644 --- a/GPy/kern/_src/add.py +++ b/GPy/kern/_src/add.py @@ -10,10 +10,17 @@ class Add(CombinationKernel): """ Add given list of kernels together. propagates gradients through.
- + This kernel will take over the active dims of it's subkernels passed in. """ def __init__(self, subkerns, name='add'): + for i, kern in enumerate(subkerns[:]): + if isinstance(kern, Add): + del subkerns[i] + for part in kern.parts[::-1]: + kern.remove_parameter(part) + subkerns.insert(i, part) + super(Add, self).__init__(subkerns, name) @Cache_this(limit=2, force_kwargs=['which_parts']) @@ -40,7 +47,7 @@ class Add(CombinationKernel): return reduce(np.add, (p.Kdiag(X) for p in which_parts)) def update_gradients_full(self, dL_dK, X, X2=None): - [p.update_gradients_full(dL_dK, X, X2) for p in self.parts] + [p.update_gradients_full(dL_dK, X, X2) for p in self.parts if not p.is_fixed] def update_gradients_diag(self, dL_dK, X): [p.update_gradients_diag(dL_dK, X) for p in self.parts] @@ -63,13 +70,16 @@ class Add(CombinationKernel): target = np.zeros(X.shape) [target.__iadd__(p.gradients_X_diag(dL_dKdiag, X)) for p in self.parts] return target - + + @Cache_this(limit=2, force_kwargs=['which_parts']) def psi0(self, Z, variational_posterior): return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts)) - + + @Cache_this(limit=2, force_kwargs=['which_parts']) def psi1(self, Z, variational_posterior): return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts)) + @Cache_this(limit=2, force_kwargs=['which_parts']) def psi2(self, Z, variational_posterior): psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts)) #return psi2 @@ -88,17 +98,18 @@ class Add(CombinationKernel): # rbf X bias #elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)): elif isinstance(p1, Bias) and isinstance(p2, (RBF, Linear)): - tmp = p2.psi1(Z, variational_posterior) - psi2 += p1.variance * (tmp[:, :, None] + tmp[:, None, :]) + tmp = p2.psi1(Z, variational_posterior).sum(axis=0) + psi2 += p1.variance * (tmp[:,None]+tmp[None,:]) #(tmp[:, :, None] + tmp[:, None, :]) #elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, 
RBFInv)): elif isinstance(p2, Bias) and isinstance(p1, (RBF, Linear)): - tmp = p1.psi1(Z, variational_posterior) - psi2 += p2.variance * (tmp[:, :, None] + tmp[:, None, :]) + tmp = p1.psi1(Z, variational_posterior).sum(axis=0) + psi2 += p2.variance * (tmp[:,None]+tmp[None,:]) #(tmp[:, :, None] + tmp[:, None, :]) elif isinstance(p2, (RBF, Linear)) and isinstance(p1, (RBF, Linear)): assert np.intersect1d(p1.active_dims, p2.active_dims).size == 0, "only non overlapping kernel dimensions allowed so far" tmp1 = p1.psi1(Z, variational_posterior) tmp2 = p2.psi1(Z, variational_posterior) - psi2 += (tmp1[:, :, None] * tmp2[:, None, :]) + (tmp2[:, :, None] * tmp1[:, None, :]) + psi2 += np.einsum('nm,no->mo',tmp1,tmp2)+np.einsum('nm,no->mo',tmp2,tmp1) + #(tmp1[:, :, None] * tmp2[:, None, :]) + (tmp2[:, :, None] * tmp1[:, None, :]) else: raise NotImplementedError, "psi2 cannot be computed for this kernel" return psi2 @@ -114,12 +125,12 @@ class Add(CombinationKernel): if isinstance(p2, White): continue elif isinstance(p2, Bias): - eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2. + eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2. else:# np.setdiff1d(p1.active_dims, ar2, assume_unique): # TODO: Careful, not correct for overlapping active_dims - eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2. + eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2. p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior) - def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def gradients_Z_expectations(self, dL_psi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): from static import White, Bias target = np.zeros(Z.shape) for p1 in self.parts: @@ -131,36 +142,34 @@ class Add(CombinationKernel): if isinstance(p2, White): continue elif isinstance(p2, Bias): - eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2. + eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2. 
else: - eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2. - target += p1.gradients_Z_expectations(eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior) + eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2. + target += p1.gradients_Z_expectations(dL_psi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior) return target def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): from static import White, Bias - target_mu = np.zeros(variational_posterior.shape) - target_S = np.zeros(variational_posterior.shape) - for p1 in self._parameters_: + target_grads = [np.zeros(v.shape) for v in variational_posterior.parameters] + for p1 in self.parameters: #compute the effective dL_dpsi1. extra terms appear becaue of the cross terms in psi2! eff_dL_dpsi1 = dL_dpsi1.copy() - for p2 in self._parameters_: + for p2 in self.parameters: if p2 is p1: continue if isinstance(p2, White): continue elif isinstance(p2, Bias): - eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2. + eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2. else: - eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2. - a, b = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior) - target_mu += a - target_S += b - return target_mu, target_S + eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2. 
+ grads = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior) + [np.add(target_grads[i],grads[i],target_grads[i]) for i in xrange(len(grads))] + return target_grads - def add(self, other, name='sum'): + def add(self, other): if isinstance(other, Add): - other_params = other._parameters_[:] + other_params = other.parameters[:] for p in other_params: other.remove_parameter(p) self.add_parameters(*other_params) @@ -169,8 +178,11 @@ class Add(CombinationKernel): self.input_dim, self.active_dims = self.get_input_dim_active_dims(self.parts) return self - def input_sensitivity(self): - in_sen = np.zeros(self.input_dim) - for i, p in enumerate(self.parts): - in_sen[p.active_dims] += p.input_sensitivity() - return in_sen + def input_sensitivity(self, summarize=True): + if summarize: + return reduce(np.add, [k.input_sensitivity(summarize) for k in self.parts]) + else: + i_s = np.zeros((len(self.parts), self.input_dim)) + from operator import setitem + [setitem(i_s, (i, Ellipsis), k.input_sensitivity(summarize)) for i, k in enumerate(self.parts)] + return i_s diff --git a/GPy/kern/_src/independent_outputs.py b/GPy/kern/_src/independent_outputs.py index 3493cf4f..21958267 100644 --- a/GPy/kern/_src/independent_outputs.py +++ b/GPy/kern/_src/independent_outputs.py @@ -20,9 +20,11 @@ def index_to_slices(index): returns >>> [[slice(0,2,None),slice(4,5,None)],[slice(2,4,None),slice(8,10,None)],[slice(5,8,None)]] """ + if len(index)==0: + return[] #contruct the return structure - ind = np.asarray(index,dtype=np.int64) + ind = np.asarray(index,dtype=np.int) ret = [[] for i in range(ind.max()+1)] #find the switchpoints @@ -32,7 +34,7 @@ def index_to_slices(index): [ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))] return ret -class IndependentOutputs(Kern): +class IndependentOutputs(CombinationKernel): """ A kernel which can represent several independent functions. 
this kernel 'switches off' parts of the matrix where the output indexes are different. @@ -180,6 +182,9 @@ class Hierarchical(CombinationKernel): def Kdiag(self,X): return np.diag(self.K(X)) + def gradients_X(self, dL_dK, X, X2=None): + raise NotImplementedError + def update_gradients_full(self,dL_dK,X,X2=None): slices = [index_to_slices(X[:,i]) for i in self.extra_dims] if X2 is None: diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index 70bd42b9..d8377ffc 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -23,9 +23,9 @@ class Kern(Parameterized): input_dim: - is the number of dimensions to work on. Make sure to give the + is the number of dimensions to work on. Make sure to give the tight dimensionality of inputs. - You most likely want this to be the integer telling the number of + You most likely want this to be the integer telling the number of input dimensions of the kernel. If this is not an integer (!) we will work on the whole input matrix X, and not check whether dimensions match or not (!). @@ -34,40 +34,28 @@ class Kern(Parameterized): is the active_dimensions of inputs X we will work on. All kernels will get sliced Xes as inputs, if active_dims is not None + Only positive integers are allowed in active_dims! if active_dims is None, slicing is switched off and all X will be passed through as given. :param int input_dim: the number of input dimensions to the function - :param array-like|slice|None active_dims: list of indices on which dimensions this kernel works on, or none if no slicing + :param array-like|None active_dims: list of indices on which dimensions this kernel works on, or none if no slicing Do not instantiate. 
""" super(Kern, self).__init__(name=name, *a, **kw) - try: - self.input_dim = int(input_dim) - self.active_dims = active_dims# if active_dims is not None else slice(0, input_dim, 1) - except TypeError: - # input_dim is something else then an integer - self.input_dim = input_dim - if active_dims is not None: - print "WARNING: given input_dim={} is not an integer and active_dims={} is given, switching off slicing" - self.active_dims = None + self.input_dim = int(input_dim) + + if active_dims is None: + active_dims = np.arange(input_dim) + + self.active_dims = np.array(active_dims, dtype=int) + + assert self.active_dims.size == self.input_dim, "input_dim={} does not match len(active_dim)={}, active_dims={}".format(self.input_dim, self.active_dims.size, self.active_dims) - if self.active_dims is not None and self.input_dim is not None: - assert isinstance(self.active_dims, (slice, list, tuple, np.ndarray)), 'active_dims needs to be an array-like or slice object over dimensions, {} given'.format(self.active_dims.__class__) - if isinstance(self.active_dims, slice): - self.active_dims = slice(self.active_dims.start or 0, self.active_dims.stop or self.input_dim, self.active_dims.step or 1) - active_dim_size = int(np.round((self.active_dims.stop-self.active_dims.start)/self.active_dims.step)) - elif isinstance(self.active_dims, np.ndarray): - #assert np.all(self.active_dims >= 0), 'active dimensions need to be positive. 
negative indexing is not allowed' - assert self.active_dims.ndim == 1, 'only flat indices allowed, given active_dims.shape={}, provide only indexes to the dimensions (columns) of the input'.format(self.active_dims.shape) - active_dim_size = self.active_dims.size - else: - active_dim_size = len(self.active_dims) - assert active_dim_size == self.input_dim, "input_dim={} does not match len(active_dim)={}, active_dims={}".format(self.input_dim, active_dim_size, self.active_dims) self._sliced_X = 0 self.useGPU = self._support_GPU and useGPU - @Cache_this(limit=10) + @Cache_this(limit=20) def _slice_X(self, X): return X[:, self.active_dims] @@ -115,7 +103,7 @@ class Kern(Parameterized): """ raise NotImplementedError - def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): """ Returns the derivative of the objective wrt Z, using the chain rule through the expectation variables. @@ -146,7 +134,7 @@ class Kern(Parameterized): from ...plotting.matplot_dep import kernel_plots return kernel_plots.plot_ARD(self,*args,**kw) - def input_sensitivity(self): + def input_sensitivity(self, summarize=True): """ Returns the sensitivity for each dimension of this kernel. """ @@ -156,6 +144,9 @@ class Kern(Parameterized): """ Overloading of the '+' operator. for more control, see self.add """ return self.add(other) + def __iadd__(self, other): + return self.add(other) + def add(self, other, name='add'): """ Add another kernel to this one. @@ -176,8 +167,8 @@ class Kern(Parameterized): """ Shortcut for tensor `prod`. 
""" - assert self.active_dims == range(self.input_dim), "Can only use kernels, which have their input_dims defined from 0" - assert other.active_dims == range(other.input_dim), "Can only use kernels, which have their input_dims defined from 0" + assert np.all(self.active_dims == range(self.input_dim)), "Can only use kernels, which have their input_dims defined from 0" + assert np.all(other.active_dims == range(other.input_dim)), "Can only use kernels, which have their input_dims defined from 0" other.active_dims += self.input_dim return self.prod(other) @@ -195,19 +186,19 @@ class Kern(Parameterized): assert isinstance(other, Kern), "only kernels can be added to kernels..." from prod import Prod #kernels = [] - #if isinstance(self, Prod): kernels.extend(self._parameters_) + #if isinstance(self, Prod): kernels.extend(self.parameters) #else: kernels.append(self) - #if isinstance(other, Prod): kernels.extend(other._parameters_) + #if isinstance(other, Prod): kernels.extend(other.parameters) #else: kernels.append(other) return Prod([self, other], name) def _check_input_dim(self, X): - assert X.shape[1] == self.input_dim, "You did not specify active_dims and X has wrong shape: X_dim={}, whereas input_dim={}".format(X.shape[1], self.input_dim) - - def _check_active_dims(self, X): - assert X.shape[1] >= len(np.r_[self.active_dims]), "At least {} dimensional X needed, X.shape={!s}".format(len(np.r_[self.active_dims]), X.shape) + assert X.shape[1] == self.input_dim, "{} did not specify active_dims and X has wrong shape: X_dim={}, whereas input_dim={}".format(self.name, X.shape[1], self.input_dim) + + def _check_active_dims(self, X): + assert X.shape[1] >= len(self.active_dims), "At least {} dimensional X needed, X.shape={!s}".format(len(self.active_dims), X.shape) + - class CombinationKernel(Kern): """ Abstract super class for combination kernels. 
@@ -222,9 +213,10 @@ class CombinationKernel(Kern): :param list kernels: List of kernels to combine (can be only one element) :param str name: name of the combination kernel - :param array-like|slice extra_dims: if needed extra dimensions for the combination kernel to work on + :param array-like extra_dims: if needed extra dimensions for the combination kernel to work on """ assert all([isinstance(k, Kern) for k in kernels]) + extra_dims = np.array(extra_dims, dtype=int) input_dim, active_dims = self.get_input_dim_active_dims(kernels, extra_dims) # initialize the kernel with the full input_dim super(CombinationKernel, self).__init__(input_dim, active_dims, name) @@ -233,21 +225,28 @@ class CombinationKernel(Kern): @property def parts(self): - return self._parameters_ + return self.parameters def get_input_dim_active_dims(self, kernels, extra_dims = None): #active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int)) #active_dims = np.array(np.concatenate((active_dims, extra_dims if extra_dims is not None else [])), dtype=int) - input_dim = np.array([k.input_dim for k in kernels]) - if np.all(input_dim[0]==input_dim): - input_dim = input_dim[0] - active_dims = None + input_dim = reduce(max, (k.active_dims.max() for k in kernels)) + 1 + + if extra_dims is not None: + input_dim += extra_dims.size + + active_dims = np.arange(input_dim) return input_dim, active_dims - def input_sensitivity(self): + def input_sensitivity(self, summarize=True): + """ + If summize is true, we want to get the summerized view of the sensitivities, + otherwise put everything into an array with shape (#kernels, input_dim) + in the order of appearance of the kernels in the parameterized object. + """ raise NotImplementedError("Choose the kernel you want to get the sensitivity for. You need to override the default behaviour for getting the input sensitivity to be able to get the input sensitivity. 
For sum kernel it is the sum of all sensitivities, TODO: product kernel? Other kernels?, also TODO: shall we return all the sensitivities here in the combination kernel? So we can combine them however we want? This could lead to just plot all the sensitivities here...") - def _check_input_dim(self, X): + def _check_active_dims(self, X): return def _check_input_dim(self, X): diff --git a/GPy/kern/_src/kernel_slice_operations.py b/GPy/kern/_src/kernel_slice_operations.py index c1c8d7f1..3473ffce 100644 --- a/GPy/kern/_src/kernel_slice_operations.py +++ b/GPy/kern/_src/kernel_slice_operations.py @@ -124,9 +124,9 @@ def _slice_update_gradients_expectations(f): def _slice_gradients_Z_expectations(f): @wraps(f) - def wrap(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): with _Slice_wrap(self, Z, variational_posterior) as s: - ret = s.handle_return_array(f(self, dL_dpsi1, dL_dpsi2, s.X, s.X2)) + ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2)) return ret return wrap diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py index f9dacf02..9fdacdbb 100644 --- a/GPy/kern/_src/linear.py +++ b/GPy/kern/_src/linear.py @@ -3,15 +3,13 @@ import numpy as np -from scipy import weave from kern import Kern from ...util.linalg import tdot -from ...util.misc import param_to_array from ...core.parameterization import Param from ...core.parameterization.transformations import Logexp from ...util.caching import Cache_this -from ...core.parameterization import variational -from psi_comp import linear_psi_comp +from ...util.config import * +from .psi_comp import PSICOMP_Linear class Linear(Kern): """ @@ -52,6 +50,7 @@ class Linear(Kern): self.variances = Param('variances', variances, Logexp()) self.add_parameter(self.variances) + self.psicomp = PSICOMP_Linear() @Cache_this(limit=2) def K(self, X, X2=None): @@ -77,10 +76,12 @@ class Linear(Kern): def update_gradients_full(self, 
dL_dK, X, X2=None): if self.ARD: if X2 is None: - self.variances.gradient = np.array([np.sum(dL_dK * tdot(X[:, i:i + 1])) for i in range(self.input_dim)]) + #self.variances.gradient = np.array([np.sum(dL_dK * tdot(X[:, i:i + 1])) for i in range(self.input_dim)]) + self.variances.gradient = np.einsum('ij,iq,jq->q', dL_dK, X, X) else: - product = X[:, None, :] * X2[None, :, :] - self.variances.gradient = (dL_dK[:, :, None] * product).sum(0).sum(0) + #product = X[:, None, :] * X2[None, :, :] + #self.variances.gradient = (dL_dK[:, :, None] * product).sum(0).sum(0) + self.variances.gradient = np.einsum('ij,iq,jq->q', dL_dK, X, X2) else: self.variances.gradient = np.sum(self._dot_product(X, X2) * dL_dK) @@ -94,225 +95,42 @@ class Linear(Kern): def gradients_X(self, dL_dK, X, X2=None): if X2 is None: - return 2.*(((X[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1) + return np.einsum('jq,q,ij->iq', X, 2*self.variances, dL_dK) else: - return (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1) + #return (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1) + return np.einsum('jq,q,ij->iq', X2, self.variances, dL_dK) def gradients_X_diag(self, dL_dKdiag, X): return 2.*self.variances*dL_dKdiag[:,None]*X + def input_sensitivity(self): + return np.ones(self.input_dim) * self.variances + #---------------------------------------# # PSI statistics # #---------------------------------------# def psi0(self, Z, variational_posterior): - if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): - gamma = variational_posterior.binary_prob - mu = variational_posterior.mean - S = variational_posterior.variance - - return np.einsum('q,nq,nq->n',self.variances,gamma,np.square(mu)+S) -# return (self.variances*gamma*(np.square(mu)+S)).sum(axis=1) - else: - return np.sum(self.variances * self._mu2S(variational_posterior), 1) + return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[0] def psi1(self, Z, variational_posterior): 
- if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): - gamma = variational_posterior.binary_prob - mu = variational_posterior.mean - return np.einsum('nq,q,mq,nq->nm',gamma,self.variances,Z,mu) -# return (self.variances*gamma*mu).sum(axis=1) - else: - return self.K(variational_posterior.mean, Z) #the variance, it does nothing + return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[1] - @Cache_this(limit=1) def psi2(self, Z, variational_posterior): - if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): - gamma = variational_posterior.binary_prob - mu = variational_posterior.mean - S = variational_posterior.variance - mu2 = np.square(mu) - variances2 = np.square(self.variances) - tmp = np.einsum('nq,q,mq,nq->nm',gamma,self.variances,Z,mu) - return np.einsum('nq,q,mq,oq,nq->nmo',gamma,variances2,Z,Z,mu2+S)+\ - np.einsum('nm,no->nmo',tmp,tmp) - np.einsum('nq,q,mq,oq,nq->nmo',np.square(gamma),variances2,Z,Z,mu2) - else: - ZA = Z * self.variances - ZAinner = self._ZAinner(variational_posterior, Z) - return np.dot(ZAinner, ZA.T) + return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[2] def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): - if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): - gamma = variational_posterior.binary_prob - mu = variational_posterior.mean - S = variational_posterior.variance - mu2S = np.square(mu)+S - _dpsi2_dvariance, _, _, _, _ = linear_psi_comp._psi2computations(self.variances, Z, mu, S, gamma) - grad = np.einsum('n,nq,nq->q',dL_dpsi0,gamma,mu2S) + np.einsum('nm,nq,mq,nq->q',dL_dpsi1,gamma,Z,mu) +\ - np.einsum('nmo,nmoq->q',dL_dpsi2,_dpsi2_dvariance) - if self.ARD: - self.variances.gradient = grad - else: - self.variances.gradient = grad.sum() + dL_dvar = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[0] + if self.ARD: + 
self.variances.gradient = dL_dvar else: - #psi1 - self.update_gradients_full(dL_dpsi1, variational_posterior.mean, Z) - # psi0: - tmp = dL_dpsi0[:, None] * self._mu2S(variational_posterior) - if self.ARD: self.variances.gradient += tmp.sum(0) - else: self.variances.gradient += tmp.sum() - #psi2 - if self.ARD: - tmp = dL_dpsi2[:, :, :, None] * (self._ZAinner(variational_posterior, Z)[:, :, None, :] * Z[None, None, :, :]) - self.variances.gradient += 2.*tmp.sum(0).sum(0).sum(0) - else: - self.variances.gradient += 2.*np.sum(dL_dpsi2 * self.psi2(Z, variational_posterior))/self.variances + self.variances.gradient = dL_dvar.sum() - def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): - if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): - gamma = variational_posterior.binary_prob - mu = variational_posterior.mean - S = variational_posterior.variance - _, _, _, _, _dpsi2_dZ = linear_psi_comp._psi2computations(self.variances, Z, mu, S, gamma) - - grad = np.einsum('nm,nq,q,nq->mq',dL_dpsi1,gamma, self.variances,mu) +\ - np.einsum('nmo,noq->mq',dL_dpsi2,_dpsi2_dZ) - - return grad - else: - #psi1 - grad = self.gradients_X(dL_dpsi1.T, Z, variational_posterior.mean) - #psi2 - self._weave_dpsi2_dZ(dL_dpsi2, Z, variational_posterior, grad) - return grad + def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[1] def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): - if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): - gamma = variational_posterior.binary_prob - mu = variational_posterior.mean - S = variational_posterior.variance - mu2S = np.square(mu)+S - _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _ = linear_psi_comp._psi2computations(self.variances, Z, mu, S, gamma) - - grad_gamma = 
np.einsum('n,q,nq->nq',dL_dpsi0,self.variances,mu2S) + np.einsum('nm,q,mq,nq->nq',dL_dpsi1,self.variances,Z,mu) +\ - np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dgamma) - grad_mu = np.einsum('n,nq,q,nq->nq',dL_dpsi0,gamma,2.*self.variances,mu) + np.einsum('nm,nq,q,mq->nq',dL_dpsi1,gamma,self.variances,Z) +\ - np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dmu) - grad_S = np.einsum('n,nq,q->nq',dL_dpsi0,gamma,self.variances) + np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dS) - - return grad_mu, grad_S, grad_gamma - else: - grad_mu, grad_S = np.zeros(variational_posterior.mean.shape), np.zeros(variational_posterior.mean.shape) - # psi0 - grad_mu += dL_dpsi0[:, None] * (2.0 * variational_posterior.mean * self.variances) - grad_S += dL_dpsi0[:, None] * self.variances - # psi1 - grad_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1) - # psi2 - self._weave_dpsi2_dmuS(dL_dpsi2, Z, variational_posterior, grad_mu, grad_S) - - return grad_mu, grad_S - - #--------------------------------------------------# - # Helpers for psi statistics # - #--------------------------------------------------# - - - def _weave_dpsi2_dmuS(self, dL_dpsi2, Z, vp, target_mu, target_S): - # Think N,num_inducing,num_inducing,input_dim - ZA = Z * self.variances - AZZA = ZA.T[:, None, :, None] * ZA[None, :, None, :] - AZZA = AZZA + AZZA.swapaxes(1, 2) - AZZA_2 = AZZA/2. 
- - #Using weave, we can exploit the symmetry of this problem: - code = """ - int n, m, mm,q,qq; - double factor,tmp; - #pragma omp parallel for private(m,mm,q,qq,factor,tmp) - for(n=0;n - #include - """ - weave_options = {'headers' : [''], - 'extra_compile_args': ['-fopenmp -O3'], #-march=native'], - 'extra_link_args' : ['-lgomp']} - mu = vp.mean - N,num_inducing,input_dim,mu = mu.shape[0],Z.shape[0],mu.shape[1],param_to_array(mu) - weave.inline(code, support_code=support_code, libraries=['gomp'], - arg_names=['N','num_inducing','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'], - type_converters=weave.converters.blitz,**weave_options) - - - def _weave_dpsi2_dZ(self, dL_dpsi2, Z, vp, target): - AZA = self.variances*self._ZAinner(vp, Z) - code=""" - int n,m,mm,q; - #pragma omp parallel for private(n,mm,q) - for(m=0;m - #include - """ - weave_options = {'headers' : [''], - 'extra_compile_args': ['-fopenmp -O3'], #-march=native'], - 'extra_link_args' : ['-lgomp']} - - N,num_inducing,input_dim = vp.mean.shape[0],Z.shape[0],vp.mean.shape[1] - mu = param_to_array(vp.mean) - weave.inline(code, support_code=support_code, libraries=['gomp'], - arg_names=['N','num_inducing','input_dim','AZA','target','dL_dpsi2'], - type_converters=weave.converters.blitz,**weave_options) - - - @Cache_this(limit=1, ignore_args=(0,)) - def _mu2S(self, vp): - return np.square(vp.mean) + vp.variance - - @Cache_this(limit=1) - def _ZAinner(self, vp, Z): - ZA = Z*self.variances - inner = (vp.mean[:, None, :] * vp.mean[:, :, None]) - diag_indices = np.diag_indices(vp.mean.shape[1], 2) - inner[:, diag_indices[0], diag_indices[1]] += vp.variance - - return np.dot(ZA, inner).swapaxes(0, 1) # NOTE: self.ZAinner \in [num_inducing x num_data x input_dim]! 
- - def input_sensitivity(self): - return np.ones(self.input_dim) * self.variances + return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[2:] class LinearFull(Kern): def __init__(self, input_dim, rank, W=None, kappa=None, active_dims=None, name='linear_full'): diff --git a/GPy/kern/_src/periodic.py b/GPy/kern/_src/periodic.py index a8573a05..9f232ab0 100644 --- a/GPy/kern/_src/periodic.py +++ b/GPy/kern/_src/periodic.py @@ -101,6 +101,7 @@ class PeriodicExponential(Periodic): Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None] return(self.lengthscale/(2*self.variance) * Gint + 1./self.variance*np.dot(Flower,Flower.T)) + @silence_errors def update_gradients_full(self, dL_dK, X, X2=None): """derivative of the covariance matrix with respect to the parameters (shape is N x num_inducing x num_params)""" if X2 is None: X2 = X @@ -213,7 +214,7 @@ class PeriodicMatern32(Periodic): return(self.lengthscale**3/(12*np.sqrt(3)*self.variance) * Gint + 1./self.variance*np.dot(Flower,Flower.T) + self.lengthscale**2/(3.*self.variance)*np.dot(F1lower,F1lower.T)) - #@silence_errors + @silence_errors def update_gradients_full(self,dL_dK,X,X2): """derivative of the covariance matrix with respect to the parameters (shape is num_data x num_inducing x num_params)""" if X2 is None: X2 = X diff --git a/GPy/kern/_src/psi_comp/__init__.py b/GPy/kern/_src/psi_comp/__init__.py index 4c0d373d..7a5851fb 100644 --- a/GPy/kern/_src/psi_comp/__init__.py +++ b/GPy/kern/_src/psi_comp/__init__.py @@ -1,2 +1,53 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
# Licensed under the BSD 3-clause license (see LICENSE.txt) + +from ....core.parameterization.parameter_core import Pickleable +from GPy.util.caching import Cache_this +from ....core.parameterization import variational +import rbf_psi_comp +import ssrbf_psi_comp +import sslinear_psi_comp +import linear_psi_comp + +class PSICOMP_RBF(Pickleable): + + @Cache_this(limit=2, ignore_args=(0,)) + def psicomputations(self, variance, lengthscale, Z, variational_posterior): + if isinstance(variational_posterior, variational.NormalPosterior): + return rbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior) + elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): + return ssrbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior) + else: + raise ValueError, "unknown distriubtion received for psi-statistics" + + @Cache_this(limit=2, ignore_args=(0,1,2,3)) + def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): + if isinstance(variational_posterior, variational.NormalPosterior): + return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior) + elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): + return ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior) + else: + raise ValueError, "unknown distriubtion received for psi-statistics" + +class PSICOMP_Linear(Pickleable): + + @Cache_this(limit=2, ignore_args=(0,)) + def psicomputations(self, variance, Z, variational_posterior): + if isinstance(variational_posterior, variational.NormalPosterior): + return linear_psi_comp.psicomputations(variance, Z, variational_posterior) + elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): + return sslinear_psi_comp.psicomputations(variance, Z, variational_posterior) + else: + raise ValueError, "unknown 
distriubtion received for psi-statistics" + + @Cache_this(limit=2, ignore_args=(0,1,2,3)) + def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior): + if isinstance(variational_posterior, variational.NormalPosterior): + return linear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior) + elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): + return sslinear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior) + else: + raise ValueError, "unknown distriubtion received for psi-statistics" + + def _setup_observers(self): + pass \ No newline at end of file diff --git a/GPy/kern/_src/psi_comp/linear_psi_comp.py b/GPy/kern/_src/psi_comp/linear_psi_comp.py index 22147366..93297e7e 100644 --- a/GPy/kern/_src/psi_comp/linear_psi_comp.py +++ b/GPy/kern/_src/psi_comp/linear_psi_comp.py @@ -2,14 +2,45 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) """ -The package for the Psi statistics computation of the linear kernel for SSGPLVM +The package for the Psi statistics computation of the linear kernel for Bayesian GPLVM """ import numpy as np -from GPy.util.caching import Cache_this -#@Cache_this(limit=1) -def _psi2computations(variance, Z, mu, S, gamma): +def psicomputations(variance, Z, variational_posterior): + """ + Compute psi-statistics for ss-linear kernel + """ + # here are the "statistics" for psi0, psi1 and psi2 + # Produced intermediate results: + # psi0 N + # psi1 NxM + # psi2 MxM + mu = variational_posterior.mean + S = variational_posterior.variance + + psi0 = np.einsum('q,nq->n',variance,np.square(mu)+S) + psi1 = np.einsum('q,mq,nq->nm',variance,Z,mu) + psi2 = np.einsum('q,mq,oq,nq->mo',np.square(variance),Z,Z,S) + np.einsum('nm,no->mo',psi1,psi1) + + return psi0, psi1, psi2 + +def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior): + mu = 
variational_posterior.mean + S = variational_posterior.variance + + dL_dvar, dL_dmu, dL_dS, dL_dZ = _psi2computations(dL_dpsi2, variance, Z, mu, S) + + # Compute for psi0 and psi1 + mu2S = np.square(mu)+S + dL_dvar += np.einsum('n,nq->q',dL_dpsi0,mu2S) + np.einsum('nm,mq,nq->q',dL_dpsi1,Z,mu) + dL_dmu += np.einsum('n,q,nq->nq',dL_dpsi0,2.*variance,mu) + np.einsum('nm,q,mq->nq',dL_dpsi1,variance,Z) + dL_dS += np.einsum('n,q->nq',dL_dpsi0,variance) + dL_dZ += np.einsum('nm,q,nq->mq',dL_dpsi1, variance,mu) + + return dL_dvar, dL_dZ, dL_dmu, dL_dS + +def _psi2computations(dL_dpsi2, variance, Z, mu, S): """ Z - MxQ mu - NxQ @@ -18,34 +49,23 @@ def _psi2computations(variance, Z, mu, S, gamma): """ # here are the "statistics" for psi1 and psi2 # Produced intermediate results: - # _psi2 NxMxM - # _psi2_dvariance NxMxMxQ - # _psi2_dZ NxMxQ - # _psi2_dgamma NxMxMxQ - # _psi2_dmu NxMxMxQ - # _psi2_dS NxMxMxQ + # _psi2_dvariance Q + # _psi2_dZ MxQ + # _psi2_dmu NxQ + # _psi2_dS NxQ - mu2 = np.square(mu) - gamma2 = np.square(gamma) variance2 = np.square(variance) - mu2S = mu2+S # NxQ - common_sum = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) # NxM - - _dpsi2_dvariance = np.einsum('nq,q,mq,oq->nmoq',2.*(gamma*mu2S-gamma2*mu2),variance,Z,Z)+\ - np.einsum('nq,mq,nq,no->nmoq',gamma,Z,mu,common_sum)+\ - np.einsum('nq,oq,nq,nm->nmoq',gamma,Z,mu,common_sum) - - _dpsi2_dgamma = np.einsum('q,mq,oq,nq->nmoq',variance2,Z,Z,(mu2S-2.*gamma*mu2))+\ - np.einsum('q,mq,nq,no->nmoq',variance,Z,mu,common_sum)+\ - np.einsum('q,oq,nq,nm->nmoq',variance,Z,mu,common_sum) - - _dpsi2_dmu = np.einsum('q,mq,oq,nq,nq->nmoq',variance2,Z,Z,mu,2.*(gamma-gamma2))+\ - np.einsum('nq,q,mq,no->nmoq',gamma,variance,Z,common_sum)+\ - np.einsum('nq,q,oq,nm->nmoq',gamma,variance,Z,common_sum) - - _dpsi2_dS = np.einsum('nq,q,mq,oq->nmoq',gamma,variance2,Z,Z) - - _dpsi2_dZ = 2.*(np.einsum('nq,q,mq,nq->nmq',gamma,variance2,Z,mu2S)+np.einsum('nq,q,nq,nm->nmq',gamma,variance,mu,common_sum) - 
-np.einsum('nq,q,mq,nq->nmq',gamma2,variance2,Z,mu2)) + common_sum = np.einsum('q,mq,nq->nm',variance,Z,mu) # NxM + Z_expect = np.einsum('mo,mq,oq->q',dL_dpsi2,Z,Z) + common_expect = np.einsum('mo,mq,no->nq',dL_dpsi2+dL_dpsi2.T,Z,common_sum) - return _dpsi2_dvariance, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _dpsi2_dZ \ No newline at end of file + dL_dvar = np.einsum('q,nq,q->q',Z_expect,2.*S,variance)+ np.einsum('nq,nq->q',common_expect,mu) + + dL_dmu = np.einsum('nq,q->nq',common_expect,variance) + + dL_dS = np.empty(S.shape) + dL_dS[:] = np.einsum('q,q->q',Z_expect,variance2) + + dL_dZ = 2.*(np.einsum('om,q,mq,nq->oq',dL_dpsi2,variance2,Z,S)+np.einsum('om,q,nq,nm->oq',dL_dpsi2,variance,mu,common_sum)) + + return dL_dvar, dL_dmu, dL_dS, dL_dZ diff --git a/GPy/kern/_src/psi_comp/rbf_psi_comp.py b/GPy/kern/_src/psi_comp/rbf_psi_comp.py new file mode 100644 index 00000000..93399ea7 --- /dev/null +++ b/GPy/kern/_src/psi_comp/rbf_psi_comp.py @@ -0,0 +1,162 @@ +""" +The module for psi-statistics for RBF kernel +""" + +import numpy as np +from GPy.util.caching import Cacher + +def psicomputations(variance, lengthscale, Z, variational_posterior): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi0, psi1 and psi2 + # Produced intermediate results: + # _psi1 NxM + mu = variational_posterior.mean + S = variational_posterior.variance + + psi0 = np.empty(mu.shape[0]) + psi0[:] = variance + psi1 = _psi1computations(variance, lengthscale, Z, mu, S) + psi2 = _psi2computations(variance, lengthscale, Z, mu, S).sum(axis=0) + return psi0, psi1, psi2 + +def __psi1computations(variance, lengthscale, Z, mu, S): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi1 + # Produced intermediate results: + # _psi1 NxM + + lengthscale2 = np.square(lengthscale) + + # psi1 + _psi1_logdenom = np.log(S/lengthscale2+1.).sum(axis=-1) # N + _psi1_log = 
(_psi1_logdenom[:,None]+np.einsum('nmq,nq->nm',np.square(mu[:,None,:]-Z[None,:,:]),1./(S+lengthscale2)))/(-2.) + _psi1 = variance*np.exp(_psi1_log) + + return _psi1 + +def __psi2computations(variance, lengthscale, Z, mu, S): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi2 + # Produced intermediate results: + # _psi2 MxM + + lengthscale2 = np.square(lengthscale) + + _psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N + _psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM + Z_hat = (Z[:,None,:]+Z[None,:,:])/2. #MxMxQ + denom = 1./(2.*S+lengthscale2) + _psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+2.*np.einsum('nq,moq,nq->nmo',mu,Z_hat,denom)-np.einsum('moq,nq->nmo',np.square(Z_hat),denom) + _psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2) + + + return _psi2 + +def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): + ARD = (len(lengthscale)!=1) + + dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance) + dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance) + + dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2 + + dL_dlengscale = dl_psi1 + dl_psi2 + if not ARD: + dL_dlengscale = dL_dlengscale.sum() + + dL_dmu = dmu_psi1 + dmu_psi2 + dL_dS = dS_psi1 + dS_psi2 + dL_dZ = dZ_psi1 + dZ_psi2 + + return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS + +def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S): + """ + dL_dpsi1 - NxM + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi1 + # Produced intermediate results: dL_dparams w.r.t. 
psi1 + # _dL_dvariance 1 + # _dL_dlengthscale Q + # _dL_dZ MxQ + # _dL_dgamma NxQ + # _dL_dmu NxQ + # _dL_dS NxQ + + lengthscale2 = np.square(lengthscale) + + _psi1 = _psi1computations(variance, lengthscale, Z, mu, S) + Lpsi1 = dL_dpsi1*_psi1 + Zmu = Z[None,:,:]-mu[:,None,:] # NxMxQ + denom = 1./(S+lengthscale2) + Zmu2_denom = np.square(Zmu)*denom[:,None,:] #NxMxQ + _dL_dvar = Lpsi1.sum()/variance + _dL_dmu = np.einsum('nm,nmq,nq->nq',Lpsi1,Zmu,denom) + _dL_dS = np.einsum('nm,nmq,nq->nq',Lpsi1,(Zmu2_denom-1.),denom)/2. + _dL_dZ = -np.einsum('nm,nmq,nq->mq',Lpsi1,Zmu,denom) + _dL_dl = np.einsum('nm,nmq,nq->q',Lpsi1,(Zmu2_denom+(S/lengthscale2)[:,None,:]),denom*lengthscale) + + return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS + +def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + dL_dpsi2 - MxM + """ + # here are the "statistics" for psi2 + # Produced the derivatives w.r.t. psi2: + # _dL_dvariance 1 + # _dL_dlengthscale Q + # _dL_dZ MxQ + # _dL_dgamma NxQ + # _dL_dmu NxQ + # _dL_dS NxQ + + lengthscale2 = np.square(lengthscale) + denom = 1./(2*S+lengthscale2) + denom2 = np.square(denom) + + _psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM + + Lpsi2 = dL_dpsi2[None,:,:]*_psi2 + Lpsi2sum = np.einsum('nmo->n',Lpsi2) #N + Lpsi2Z = np.einsum('nmo,oq->nq',Lpsi2,Z) #NxQ + Lpsi2Z2 = np.einsum('nmo,oq,oq->nq',Lpsi2,Z,Z) #NxQ + Lpsi2Z2p = np.einsum('nmo,mq,oq->nq',Lpsi2,Z,Z) #NxQ + Lpsi2Zhat = Lpsi2Z + Lpsi2Zhat2 = (Lpsi2Z2+Lpsi2Z2p)/2 + + _dL_dvar = Lpsi2sum.sum()*2/variance + _dL_dmu = (-2*denom) * (mu*Lpsi2sum[:,None]-Lpsi2Zhat) + _dL_dS = (2*np.square(denom))*(np.square(mu)*Lpsi2sum[:,None]-2*mu*Lpsi2Zhat+Lpsi2Zhat2) - denom*Lpsi2sum[:,None] + _dL_dZ = -np.einsum('nmo,oq->oq',Lpsi2,Z)/lengthscale2+np.einsum('nmo,oq->mq',Lpsi2,Z)/lengthscale2+ \ + 2*np.einsum('nmo,nq,nq->mq',Lpsi2,mu,denom) - np.einsum('nmo,nq,mq->mq',Lpsi2,denom,Z) - np.einsum('nmo,oq,nq->mq',Lpsi2,Z,denom) + _dL_dl = 
2*lengthscale* ((S/lengthscale2*denom+np.square(mu*denom))*Lpsi2sum[:,None]+(Lpsi2Z2-Lpsi2Z2p)/(2*np.square(lengthscale2))- + (2*mu*denom2)*Lpsi2Zhat+denom2*Lpsi2Zhat2).sum(axis=0) + + return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS + +_psi1computations = Cacher(__psi1computations, limit=1) +_psi2computations = Cacher(__psi2computations, limit=1) diff --git a/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py new file mode 100644 index 00000000..dda68bdf --- /dev/null +++ b/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py @@ -0,0 +1,411 @@ +""" +The module for psi-statistics for RBF kernel +""" + +import numpy as np +from ....util.caching import Cache_this +from . import PSICOMP_RBF +from ....util import gpu_init + +try: + import pycuda.gpuarray as gpuarray + from pycuda.compiler import SourceModule + from ....util.linalg_gpu import sum_axis +except: + pass + +gpu_code = """ + // define THREADNUM + + #define IDX_NMQ(n,m,q) ((q*M+m)*N+n) + #define IDX_NMM(n,m1,m2) ((m2*M+m1)*N+n) + #define IDX_NQ(n,q) (q*N+n) + #define IDX_NM(n,m) (m*N+n) + #define IDX_MQ(m,q) (q*M+m) + #define IDX_MM(m1,m2) (m2*M+m1) + #define IDX_NQB(n,q,b) ((b*Q+q)*N+n) + #define IDX_QB(q,b) (b*Q+q) + + // Divide data evenly + __device__ void divide_data(int total_data, int psize, int pidx, int *start, int *end) { + int residue = (total_data)%psize; + if(pidx= blockDim.x) { + for(int i=blockDim.x+threadIdx.x; i=1;s=s/2) { + if(threadIdx.x < s) {array[threadIdx.x] += array[s+threadIdx.x];} + __syncthreads(); + } + } + + __global__ void compDenom(double *log_denom1, double *log_denom2, double *l, double *S, int N, int Q) + { + int n_start, n_end; + divide_data(N, gridDim.x, blockIdx.x, &n_start, &n_end); + + for(int i=n_start*Q+threadIdx.x; in',variance,gamma,np.square(mu)+S) + psi1 = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) + psi2 = np.einsum('nq,q,mq,oq,nq->mo',gamma,np.square(variance),Z,Z,(1-gamma)*np.square(mu)+S) +\ + np.einsum('nm,no->mo',psi1,psi1) + + 
return psi0, psi1, psi2 + +def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior): + mu = variational_posterior.mean + S = variational_posterior.variance + gamma = variational_posterior.binary_prob + + dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ = _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma) + + # Compute for psi0 and psi1 + mu2S = np.square(mu)+S + dL_dvar += np.einsum('n,nq,nq->q',dL_dpsi0,gamma,mu2S) + np.einsum('nm,nq,mq,nq->q',dL_dpsi1,gamma,Z,mu) + dL_dgamma += np.einsum('n,q,nq->nq',dL_dpsi0,variance,mu2S) + np.einsum('nm,q,mq,nq->nq',dL_dpsi1,variance,Z,mu) + dL_dmu += np.einsum('n,nq,q,nq->nq',dL_dpsi0,gamma,2.*variance,mu) + np.einsum('nm,nq,q,mq->nq',dL_dpsi1,gamma,variance,Z) + dL_dS += np.einsum('n,nq,q->nq',dL_dpsi0,gamma,variance) + dL_dZ += np.einsum('nm,nq,q,nq->mq',dL_dpsi1,gamma, variance,mu) + + return dL_dvar, dL_dZ, dL_dmu, dL_dS, dL_dgamma + +def _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi1 and psi2 + # Produced intermediate results: + # _psi2_dvariance Q + # _psi2_dZ MxQ + # _psi2_dgamma NxQ + # _psi2_dmu NxQ + # _psi2_dS NxQ + + mu2 = np.square(mu) + gamma2 = np.square(gamma) + variance2 = np.square(variance) + mu2S = mu2+S # NxQ + common_sum = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) # NxM + Z_expect = np.einsum('mo,mq,oq->q',dL_dpsi2,Z,Z) + common_expect = np.einsum('mo,mq,no->nq',dL_dpsi2+dL_dpsi2.T,Z,common_sum) + + dL_dvar = np.einsum('nq,q,q->q',2.*(gamma*mu2S-gamma2*mu2),variance,Z_expect)+\ + np.einsum('nq,nq,nq->q',common_expect,gamma,mu) + + dL_dgamma = np.einsum('q,q,nq->nq',Z_expect,variance2,(mu2S-2.*gamma*mu2))+\ + np.einsum('nq,q,nq->nq',common_expect,variance,mu) + + dL_dmu = np.einsum('q,q,nq,nq->nq',Z_expect,variance2,mu,2.*(gamma-gamma2))+\ + np.einsum('nq,nq,q->nq',common_expect,gamma,variance) + + dL_dS = np.einsum('q,nq,q->nq',Z_expect,gamma,variance2) + + 
dL_dZ = 2.*(np.einsum('om,nq,q,mq,nq->oq',dL_dpsi2,gamma,variance2,Z,(mu2S-gamma*mu2))+np.einsum('om,nq,q,nq,nm->oq',dL_dpsi2,gamma,variance,mu,common_sum)) + + return dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py index d8414cfb..6302a590 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py @@ -6,15 +6,27 @@ The package for the psi statistics computation """ import numpy as np -from GPy.util.caching import Cache_this -@Cache_this(limit=1) -def _Z_distances(Z): - Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q - Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q - return Zhat, Zdist +def psicomputations(variance, lengthscale, Z, variational_posterior): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi0, psi1 and psi2 + # Produced intermediate results: + # _psi1 NxM + mu = variational_posterior.mean + S = variational_posterior.variance + gamma = variational_posterior.binary_prob + + psi0 = np.empty(mu.shape[0]) + psi0[:] = variance + psi1 = _psi1computations(variance, lengthscale, Z, mu, S, gamma) + psi2 = _psi2computations(variance, lengthscale, Z, mu, S, gamma) + return psi0, psi1, psi2 -@Cache_this(limit=1) def _psi1computations(variance, lengthscale, Z, mu, S, gamma): """ Z - MxQ @@ -22,16 +34,10 @@ def _psi1computations(variance, lengthscale, Z, mu, S, gamma): S - NxQ gamma - NxQ """ - # here are the "statistics" for psi1 and psi2 + # here are the "statistics" for psi1 # Produced intermediate results: # _psi1 NxM - # _dpsi1_dvariance NxM - # _dpsi1_dlengthscale NxMxQ - # _dpsi1_dZ NxMxQ - # _dpsi1_dgamma NxMxQ - # _dpsi1_dmu NxMxQ - # _dpsi1_dS NxMxQ - + lengthscale2 = np.square(lengthscale) # psi1 @@ -40,25 +46,15 @@ def _psi1computations(variance, lengthscale, Z, mu, S, gamma): _psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ _psi1_dist_sq = np.square(_psi1_dist) / 
(lengthscale2 * _psi1_denom) # NxMxQ _psi1_common = gamma[:,None,:] / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #Nx1xQ - _psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom)) # NxMxQ - _psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ + _psi1_exponent1 = np.log(gamma[:,None,:]) - (_psi1_dist_sq + np.log(_psi1_denom))/2. # NxMxQ + _psi1_exponent2 = np.log(1.-gamma[:,None,:]) - (np.square(Z[None,:,:])/lengthscale2)/2. # NxMxQ _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2) _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM - _psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ - _psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ - _psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ _psi1 = variance * np.exp(_psi1_exp_sum) # NxM - _dpsi1_dvariance = _psi1 / variance # NxM - _dpsi1_dgamma = _psi1_q * (_psi1_exp_dist_sq/_psi1_denom_sqrt-_psi1_exp_Z) # NxMxQ - _dpsi1_dmu = _psi1_q * (_psi1_exp_dist_sq * _psi1_dist * _psi1_common) # NxMxQ - _dpsi1_dS = _psi1_q * (_psi1_exp_dist_sq * _psi1_common * 0.5 * (_psi1_dist_sq - 1.)) # NxMxQ - _dpsi1_dZ = _psi1_q * (- _psi1_common * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z) # NxMxQ - _dpsi1_dlengthscale = 2.*lengthscale*_psi1_q * (0.5*_psi1_common*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + 0.5*(1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z) # NxMxQ - return _psi1, _dpsi1_dvariance, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _dpsi1_dZ, _dpsi1_dlengthscale + return _psi1 -@Cache_this(limit=1) def _psi2computations(variance, lengthscale, Z, mu, S, gamma): """ Z - MxQ @@ -66,19 +62,14 @@ def _psi2computations(variance, lengthscale, Z, mu, S, gamma): S - NxQ gamma - NxQ """ 
- # here are the "statistics" for psi1 and psi2 + # here are the "statistics" for psi2 # Produced intermediate results: - # _psi2 NxMxM - # _psi2_dvariance NxMxM - # _psi2_dlengthscale NxMxMxQ - # _psi2_dZ NxMxMxQ - # _psi2_dgamma NxMxMxQ - # _psi2_dmu NxMxMxQ - # _psi2_dS NxMxMxQ + # _psi2 MxM lengthscale2 = np.square(lengthscale) - _psi2_Zhat, _psi2_Zdist = _Z_distances(Z) + _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q + _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ @@ -93,15 +84,116 @@ def _psi2computations(variance, lengthscale, Z, mu, S, gamma): _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2) _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max)) _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM - _psi2_q = np.square(variance) * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ + _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM + + return _psi2 + +def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): + ARD = (len(lengthscale)!=1) + + dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1, dgamma_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2, dgamma_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + + dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2 + + dL_dlengscale = dl_psi1 + dl_psi2 + if not ARD: + dL_dlengscale = dL_dlengscale.sum() + + dL_dgamma = dgamma_psi1 + dgamma_psi2 + dL_dmu = dmu_psi1 + dmu_psi2 + dL_dS = dS_psi1 + dS_psi2 + dL_dZ = dZ_psi1 + dZ_psi2 + 
+ return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma + +def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma): + """ + dL_dpsi1 - NxM + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi1 + # Produced intermediate results: dL_dparams w.r.t. psi1 + # _dL_dvariance 1 + # _dL_dlengthscale Q + # _dL_dZ MxQ + # _dL_dgamma NxQ + # _dL_dmu NxQ + # _dL_dS NxQ + + lengthscale2 = np.square(lengthscale) + + # psi1 + _psi1_denom = S / lengthscale2 + 1. # NxQ + _psi1_denom_sqrt = np.sqrt(_psi1_denom) #NxQ + _psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ + _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom[:,None,:]) # NxMxQ + _psi1_common = gamma / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #NxQ + _psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom[:, None,:])) # NxMxQ + _psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ + _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2) + _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ + _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM + _psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ + _psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ + _psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ + _psi1 = variance * np.exp(_psi1_exp_sum) # NxM + _dL_dvariance = np.einsum('nm,nm->',dL_dpsi1, _psi1)/variance # 1 + _dL_dgamma = np.einsum('nm,nmq,nmq->nq',dL_dpsi1, _psi1_q, (_psi1_exp_dist_sq/_psi1_denom_sqrt[:,None,:]-_psi1_exp_Z)) # NxQ + _dL_dmu = np.einsum('nm, nmq, nmq, nmq, nq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_dist,_psi1_common) # NxQ + _dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2. 
# NxQ + _dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z)) + _dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z)) + + return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma + +def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + dL_dpsi2 - MxM + """ + # here are the "statistics" for psi2 + # Produced the derivatives w.r.t. psi2: + # _dL_dvariance 1 + # _dL_dlengthscale Q + # _dL_dZ MxQ + # _dL_dgamma NxQ + # _dL_dmu NxQ + # _dL_dS NxQ + + lengthscale2 = np.square(lengthscale) + + _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q + _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q + _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q + _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ + + # psi2 + _psi2_denom = 2.*S / lengthscale2 + 1. 
# NxQ + _psi2_denom_sqrt = np.sqrt(_psi2_denom) + _psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q + _psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom[:,None,None,:]) + _psi2_common = gamma/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # NxQ + _psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom[:,None,None,:])+np.log(gamma[:,None,None,:]) #N,M,M,Q + _psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ + _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2) + _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max)) + _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM + _psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ _psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ _psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ - _psi2 = np.square(variance) * np.exp(_psi2_exp_sum) # N,M,M - _dpsi2_dvariance = 2. * _psi2/variance # NxMxM - _dpsi2_dgamma = _psi2_q * (_psi2_exp_dist_sq/_psi2_denom_sqrt - _psi2_exp_Z) # NxMxMxQ - _dpsi2_dmu = _psi2_q * (-2.*_psi2_common*_psi2_mudist * _psi2_exp_dist_sq) # NxMxMxQ - _dpsi2_dS = _psi2_q * (_psi2_common * (2.*_psi2_mudist_sq - 1.) 
* _psi2_exp_dist_sq) # NxMxMxQ - _dpsi2_dZ = 2.*_psi2_q * (_psi2_common*(-_psi2_Zdist*_psi2_denom+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z) # NxMxMxQ - _dpsi2_dlengthscale = 2.*lengthscale* _psi2_q * (_psi2_common*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z) # NxMxMxQ + _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM + _dL_dvariance = np.einsum('mo,mo->',dL_dpsi2,_psi2)*2./variance + _dL_dgamma = np.einsum('mo,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,(_psi2_exp_dist_sq/_psi2_denom_sqrt[:,None,None,:] - _psi2_exp_Z)) + _dL_dmu = -2.*np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,_psi2_common,_psi2_mudist,_psi2_exp_dist_sq) + _dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq) + _dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z)) + _dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z)) - return _psi2, _dpsi2_dvariance, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _dpsi2_dZ, _dpsi2_dlengthscale + return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index f49dc52a..1a9d2058 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -1,535 +1,474 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
-# Licensed under the BSD 3-clause license (see LICENSE.txt) """ -The package for the psi statistics computation on GPU +The module for psi-statistics for RBF kernel for Spike-and-Slab GPLVM """ import numpy as np -from GPy.util.caching import Cache_this - +from ....util.caching import Cache_this +from . import PSICOMP_RBF from ....util import gpu_init try: import pycuda.gpuarray as gpuarray - from scikits.cuda import cublas - from pycuda.reduction import ReductionKernel - from pycuda.elementwise import ElementwiseKernel - from ....util import linalg_gpu - - - # The kernel form computing psi1 het_noise - comp_psi1 = ElementwiseKernel( - "double *psi1, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q", - "psi1[i] = comp_psi1_element(var, l, Z, mu, S, logGamma, log1Gamma, logpsi1denom, N, M, Q, i)", - "comp_psi1", - preamble=""" - #define IDX_NMQ(n,m,q) ((q*M+m)*N+n) - #define IDX_NQ(n,q) (q*N+n) - #define IDX_MQ(m,q) (q*M+m) - #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) - - __device__ double comp_psi1_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx) - { - int n = idx%N; - int m = idx/N; - double psi1_exp=0; - for(int q=0;q=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) - - __device__ double comp_psi2_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx) - { - // psi2 (n,m1,m2) - int m2 = idx/(M*N); - int m1 = (idx%(M*N))/N; - int n = idx%N; - - double psi2_exp=0; - for(int q=0;q=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) - - __device__ double comp_dpsi1_dvar_element(double *psi1_neq, double *psi1exp1, double *psi1exp2, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx) - { - 
int n = idx%N; - int m = idx/N; - - double psi1_sum = 0; - for(int q=0;q=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) - - __device__ double comp_dpsi2_dvar_element(double *psi2_neq, double *psi2exp1, double *psi2exp2, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx) - { - // psi2 (n,m1,m2) - int m2 = idx/(M*N); - int m1 = (idx%(M*N))/N; - int n = idx%N; - - double psi2_sum=0; - for(int q=0;q= blockDim.x) { + for(int i=blockDim.x+threadIdx.x; i=1;s=s/2) { + if(threadIdx.x < s) {array[threadIdx.x] += array[s+threadIdx.x];} + __syncthreads(); + } + } + + __global__ void compDenom(double *log_denom1, double *log_denom2, double *log_gamma, double*log_gamma1, double *gamma, double *l, double *S, int N, int Q) + { + int n_start, n_end; + divide_data(N, gridDim.x, blockIdx.x, &n_start, &n_end); - if self.gpuCacheAll!=None and self.gpuCacheAll['mu_gpu'].shape[0] reallocate - self._releaseMemory() + for(int i=n_start*Q+threadIdx.x; iexp2)?exp1+log1p(exp(exp2-exp1)):exp2+log1p(exp(exp1-exp2)); + } + psi1[IDX_NM(n,m)] = var*exp(log_psi1); + } + } + } + + __global__ void psi2computations(double *psi2, double *psi2n, double *log_denom2, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q) + { + int psi2_idx_start, psi2_idx_end; + __shared__ double psi2_local[THREADNUM]; + divide_data((M+1)*M/2, gridDim.x, blockIdx.x, &psi2_idx_start, &psi2_idx_end); + + for(int psi2_idx=psi2_idx_start; psi2_idxexp2)?exp1+log1p(exp(exp2-exp1)):exp2+log1p(exp(exp1-exp2)); + } + double exp_psi2_n = exp(log_psi2_n); + psi2n[IDX_NMM(n,m1,m2)] = var*var*exp_psi2_n; + if(m1!=m2) { psi2n[IDX_NMM(n,m2,m1)] = var*var*exp_psi2_n;} + psi2_local[threadIdx.x] += exp_psi2_n; + } + __syncthreads(); + reduce_sum(psi2_local, THREADNUM); + if(threadIdx.x==0) { + psi2[IDX_MM(m1,m2)] = var*var*psi2_local[0]; + if(m1!=m2) { psi2[IDX_MM(m2,m1)] = 
var*var*psi2_local[0]; } + } + __syncthreads(); + } + } + + __global__ void psi1compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dgamma, double *dL_dpsi1, double *psi1, double *log_denom1, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q) + { + int m_start, m_end; + __shared__ double g_local[THREADNUM]; + divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end); + int P = int(ceil(double(N)/THREADNUM)); + + double dvar_local = 0; + for(int q=0;qexp2) { + d_exp1 = 1.; + d_exp2 = exp(exp2-exp1); + } else { + d_exp1 = exp(exp1-exp2); + d_exp2 = 1.; + } + double exp_sum = d_exp1+d_exp2; + + dmu_local += lpsi1*Zmu*d_exp1/(denom*exp_sum); + dS_local += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum); + dgamma_local += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum; + dl_local += lpsi1*((Zmu2_denom+Snq/lq)/denom*d_exp1+Zmq*Zmq/(lq*lq)*d_exp2)/(2.*exp_sum); + g_local[threadIdx.x] = lpsi1*(-Zmu/denom*d_exp1-Zmq/lq*d_exp2)/exp_sum; + } + __syncthreads(); + reduce_sum(g_local, pexp2) { + d_exp1 = 1.; + d_exp2 = exp(exp2-exp1); + } else { + d_exp1 = exp(exp1-exp2); + d_exp2 = 1.; + } + double exp_sum = d_exp1+d_exp2; + + dmu_local += lpsi2*muZhat/denom*d_exp1/exp_sum; + dS_local += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum; + dgamma_local += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum; + dl_local += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZ*dZ/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum; + g_local[threadIdx.x] += 2.*lpsi2*((muZhat/denom-dZ/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum; + } + } + __syncthreads(); + reduce_sum(g_local, p