diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 3252ac08..38a7bb3d 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -89,7 +89,7 @@ class GP(Model): self.link_parameter(self.kern) self.link_parameter(self.likelihood) - def set_XY(self, X=None, Y=None): + def set_XY(self, X=None, Y=None, trigger_update=True): """ Set the input / output data of the model This is useful if we wish to change our existing data but maintain the same model @@ -99,7 +99,7 @@ class GP(Model): :param Y: output observations :type Y: np.ndarray """ - self.update_model(False) + if trigger_update: self.update_model(False) if Y is not None: if self.normalizer is not None: self.normalizer.scale_by(Y) @@ -123,26 +123,26 @@ class GP(Model): self.link_parameters(self.X) else: self.X = ObsAr(X) - self.update_model(True) - self._trigger_params_changed() + if trigger_update: self.update_model(True) + if trigger_update: self._trigger_params_changed() - def set_X(self,X): + def set_X(self,X, trigger_update=True): """ Set the input data of the model :param X: input observations :type X: np.ndarray """ - self.set_XY(X=X) + self.set_XY(X=X, trigger_update=trigger_update) - def set_Y(self,Y): + def set_Y(self,Y, trigger_update=True): """ Set the output data of the model :param X: output observations :type X: np.ndarray """ - self.set_XY(Y=Y) + self.set_XY(Y=Y, trigger_update=trigger_update) def parameters_changed(self): """ diff --git a/GPy/core/mapping.py b/GPy/core/mapping.py index 111fec6f..dd45a26e 100644 --- a/GPy/core/mapping.py +++ b/GPy/core/mapping.py @@ -1,4 +1,5 @@ # Copyright (c) 2013,2014, GPy authors (see AUTHORS.txt). +# Copyright (c) 2015, James Hensman # Licensed under the BSD 3-clause license (see LICENSE.txt) import sys @@ -7,7 +8,7 @@ import numpy as np class Mapping(Parameterized): """ - Base model for shared behavior between models that can act like a mapping. + Base model for shared mapping behaviours """ def __init__(self, input_dim, output_dim, name='mapping'): @@ -18,49 +19,12 @@ class Mapping(Parameterized): def f(self, X): raise NotImplementedError - def df_dX(self, dL_df, X): - """Evaluate derivatives of mapping outputs with respect to inputs. - - :param dL_df: gradient of the objective with respect to the function. - :type dL_df: ndarray (num_data x output_dim) - :param X: the input locations where derivatives are to be evaluated. - :type X: ndarray (num_data x input_dim) - :returns: matrix containing gradients of the function with respect to the inputs. - """ + def gradients_X(self, dL_dF, X): raise NotImplementedError - def df_dtheta(self, dL_df, X): - """The gradient of the outputs of the mapping with respect to each of the parameters. - - :param dL_df: gradient of the objective with respect to the function. - :type dL_df: ndarray (num_data x output_dim) - :param X: input locations where the function is evaluated. - :type X: ndarray (num_data x input_dim) - :returns: Matrix containing gradients with respect to parameters of each output for each input data. - :rtype: ndarray (num_params length) - """ - + def update_gradients(self, dL_dF, X): raise NotImplementedError - def plot(self, *args): - """ - Plots the mapping associated with the model. - - In one dimension, the function is plotted. - - In two dimensions, a contour-plot shows the function - - In higher dimensions, we've not implemented this yet !TODO! 
- - Can plot only part of the data and part of the posterior functions - using which_data and which_functions - - This is a convenience function: arguments are passed to - GPy.plotting.matplot_dep.models_plots.plot_mapping - """ - - if "matplotlib" in sys.modules: - from ..plotting.matplot_dep import models_plots - mapping_plots.plot_mapping(self,*args) - else: - raise NameError, "matplotlib package has not been imported." class Bijective_mapping(Mapping): """ @@ -74,72 +38,4 @@ class Bijective_mapping(Mapping): """Inverse mapping from output domain of the function to the inputs.""" raise NotImplementedError -from model import Model - -class Mapping_check_model(Model): - """ - This is a dummy model class used as a base class for checking that the - gradients of a given mapping are implemented correctly. It enables - checkgradient() to be called independently on each mapping. - """ - def __init__(self, mapping=None, dL_df=None, X=None): - num_samples = 20 - if mapping==None: - mapping = GPy.mapping.linear(1, 1) - if X==None: - X = np.random.randn(num_samples, mapping.input_dim) - if dL_df==None: - dL_df = np.ones((num_samples, mapping.output_dim)) - - self.mapping=mapping - self.X = X - self.dL_df = dL_df - self.num_params = self.mapping.num_params - Model.__init__(self) - - - def _get_params(self): - return self.mapping._get_params() - - def _get_param_names(self): - return self.mapping._get_param_names() - - def _set_params(self, x): - self.mapping._set_params(x) - - def log_likelihood(self): - return (self.dL_df*self.mapping.f(self.X)).sum() - - def _log_likelihood_gradients(self): - raise NotImplementedError, "This needs to be implemented to use the Mapping_check_model class." - -class Mapping_check_df_dtheta(Mapping_check_model): - """This class allows gradient checks for the gradient of a mapping with respect to parameters. """ - def __init__(self, mapping=None, dL_df=None, X=None): - Mapping_check_model.__init__(self,mapping=mapping,dL_df=dL_df, X=X) - - def _log_likelihood_gradients(self): - return self.mapping.df_dtheta(self.dL_df, self.X) - - -class Mapping_check_df_dX(Mapping_check_model): - """This class allows gradient checks for the gradient of a mapping with respect to X. """ - def __init__(self, mapping=None, dL_df=None, X=None): - Mapping_check_model.__init__(self,mapping=mapping,dL_df=dL_df, X=X) - - if dL_df==None: - dL_df = np.ones((self.X.shape[0],self.mapping.output_dim)) - self.num_params = self.X.shape[0]*self.mapping.input_dim - - def _log_likelihood_gradients(self): - return self.mapping.df_dX(self.dL_df, self.X).flatten() - - def _get_param_names(self): - return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])] - - def _get_params(self): - return self.X.flatten() - - def _set_params(self, x): - self.X=x.reshape(self.X.shape) diff --git a/GPy/core/model.py b/GPy/core/model.py index c5d318e7..0251d58c 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -213,14 +213,14 @@ class Model(Parameterized): self.obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e10, 1e10) return obj_f, self.obj_grads - def optimize(self, optimizer=None, start=None, messages=False, max_iters=1000, ipython_notebook=True, **kwargs): + def optimize(self, optimizer=None, start=None, messages=False, max_iters=1000, ipython_notebook=True, clear_after_finish=False, **kwargs): """ Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors. kwargs are passed to the optimizer. 
They can be: - :param max_f_eval: maximum number of function evaluations - :type max_f_eval: int + :param max_iters: maximum number of function evaluations + :type max_iters: int :messages: True: Display messages during optimisation, "ipython_notebook": :type messages: bool"string :param optimizer: which optimizer to use (defaults to self.preferred optimizer) @@ -402,6 +402,7 @@ class Model(Parameterized): model_details = [['Model', self.name + '
<br>'],
['Log-likelihood', '{}<br>'.format(float(self.log_likelihood()))],
["Number of Parameters", '{}<br>'.format(self.size)],
+ ["Number of Optimization Parameters", '{}<br>'.format(self._size_transformed())],
["Updates", '{}<br>
'.format(self._update_on)], ] from operator import itemgetter @@ -419,6 +420,7 @@ class Model(Parameterized): model_details = [['Name', self.name], ['Log-likelihood', '{}'.format(float(self.log_likelihood()))], ["Number of Parameters", '{}'.format(self.size)], + ["Number of Optimization Parameters", '{}'.format(self._size_transformed())], ["Updates", '{}'.format(self._update_on)], ] from operator import itemgetter diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index bee160b2..dc083a98 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -947,7 +947,7 @@ class Parameterizable(OptimizationHandlable): self._add_parameter_name(param, ignore_added_names) # and makes sure to not delete programmatically added parameters for other in self.parameters[::-1]: - if other is not param and other.name.startswith(param.name): + if other is not param and other.name == param.name: warn_and_retry(param, _name_digit.match(other.name)) return if pname not in dir(self): diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index 005ef2ac..4fcade79 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -10,11 +10,6 @@ from parameterization.variational import VariationalPosterior, NormalPosterior from ..util.linalg import mdot import logging -from GPy.inference.latent_function_inference.posterior import Posterior -from GPy.inference.optimization.stochastics import SparseGPStochastics,\ - SparseGPMissing -#no stochastics.py file added! from GPy.inference.optimization.stochastics import SparseGPStochastics,\ - #SparseGPMissing logger = logging.getLogger("sparse gp") class SparseGP(GP): @@ -24,6 +19,10 @@ class SparseGP(GP): This model allows (approximate) inference using variational DTC or FITC (Gaussian likelihoods) as well as non-conjugate sparse methods based on these. + + This is not for missing data, as the implementation for missing data involves + some inefficient optimization routine decisions. + See missing data SparseGP implementation in py:class:'~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch'. :param X: inputs :type X: np.ndarray (num_data x input_dim) @@ -62,6 +61,14 @@ class SparseGP(GP): def has_uncertain_inputs(self): return isinstance(self.X, VariationalPosterior) + + def set_Z(self, Z, trigger_update=True): + if trigger_update: self.update_model(False) + self.unlink_parameter(self.Z) + self.Z = Param('inducing inputs',Z) + self.link_parameter(self.Z, index=0) + if trigger_update: self.update_model(True) + if trigger_update: self._trigger_params_changed() def parameters_changed(self): self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata) @@ -111,7 +118,7 @@ class SparseGP(GP): For uncertain inputs, the SparseGP bound produces a full covariance structure across D, so for full_cov we return a NxDxD matrix and in the not full_cov case, we return the diagonal elements across D (NxD). - This is for both with and without missing data. + This is for both with and without missing data. See for missing data SparseGP implementation py:class:'~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch'. 
""" if kern is None: kern = self.kern diff --git a/GPy/core/verbose_optimization.py b/GPy/core/verbose_optimization.py index 1a87b3da..54e650c3 100644 --- a/GPy/core/verbose_optimization.py +++ b/GPy/core/verbose_optimization.py @@ -11,7 +11,7 @@ def exponents(fnow, current_grad): return np.sign(exps) * np.log10(exps).astype(int) class VerboseOptimization(object): - def __init__(self, model, opt, maxiters, verbose=False, current_iteration=0, ipython_notebook=True): + def __init__(self, model, opt, maxiters, verbose=False, current_iteration=0, ipython_notebook=True, clear_after_finish=False): self.verbose = verbose if self.verbose: self.model = model @@ -22,48 +22,52 @@ class VerboseOptimization(object): self.opt_name = opt.opt_name self.model.add_observer(self, self.print_status) self.status = 'running' + self.clear = clear_after_finish self.update() try: from IPython.display import display - from IPython.html.widgets import FloatProgressWidget, HTMLWidget, ContainerWidget - self.text = HTMLWidget() - self.progress = FloatProgressWidget() - self.model_show = HTMLWidget() + from IPython.html.widgets import IntProgress, HTML, Box, VBox, HBox, FlexBox + self.text = HTML(width='100%') + self.progress = IntProgress(min=0, max=maxiters) + #self.progresstext = Text(width='100%', disabled=True, value='0/{}'.format(maxiters)) + self.model_show = HTML() self.ipython_notebook = ipython_notebook except: # Not in Ipython notebook self.ipython_notebook = False if self.ipython_notebook: - self.text.set_css('width', '100%') - #self.progress.set_css('width', '100%') + left_col = VBox(children=[self.progress, self.text], padding=2, width='40%') + right_col = Box(children=[self.model_show], padding=2, width='60%') + self.hor_align = FlexBox(children = [left_col, right_col], width='100%', orientation='horizontal') - left_col = ContainerWidget(children = [self.progress, self.text]) - right_col = ContainerWidget(children = [self.model_show]) - hor_align = ContainerWidget(children = [left_col, right_col]) + display(self.hor_align) + + try: + self.text.set_css('width', '100%') + left_col.set_css({ + 'padding': '2px', + 'width': "100%", + }) + + right_col.set_css({ + 'padding': '2px', + }) + + self.hor_align.set_css({ + 'width': "100%", + }) - display(hor_align) + self.hor_align.remove_class('vbox') + self.hor_align.add_class('hbox') + + left_col.add_class("box-flex1") + right_col.add_class('box-flex0') - left_col.set_css({ - 'padding': '2px', - 'width': "100%", - }) - - right_col.set_css({ - 'padding': '2px', - }) - - hor_align.set_css({ - 'width': "100%", - }) - - hor_align.remove_class('vbox') - hor_align.add_class('hbox') - - left_col.add_class("box-flex1") - right_col.add_class('box-flex0') + except: + pass #self.text.add_class('box-flex2') #self.progress.add_class('box-flex1') @@ -102,7 +106,8 @@ class VerboseOptimization(object): html_body += "{}".format(val) html_body += "" self.text.value = html_begin + html_body + html_end - self.progress.value = 100*(self.iteration+1)/self.maxiters + self.progress.value = (self.iteration+1) + #self.progresstext.value = '0/{}'.format((self.iteration+1)) self.model_show.value = self.model._repr_html_() else: n_exps = exponents(self.fnow, self.current_gradient) @@ -146,5 +151,7 @@ class VerboseOptimization(object): if not self.ipython_notebook: print '' print 'Optimization finished in {0:.5g} Seconds'.format(self.stop-self.start) - print 'Optimization status: {0:.5g}'.format(self.status) + print 'Optimization status: {0:s}'.format(self.status) print + elif self.clear: + 
self.hor_align.close() diff --git a/GPy/inference/latent_function_inference/expectation_propagation.py b/GPy/inference/latent_function_inference/expectation_propagation.py index 26144974..647823bd 100644 --- a/GPy/inference/latent_function_inference/expectation_propagation.py +++ b/GPy/inference/latent_function_inference/expectation_propagation.py @@ -40,8 +40,11 @@ class EP(LatentFunctionInference): K = kern.K(X) if self._ep_approximation is None: + + #if we don't yet have the results of runnign EP, run EP and store the computed factors in self._ep_approximation mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata) else: + #if we've already run EP, just use the existing approximation stored in self._ep_approximation mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation Wi, LW, LWi, W_logdet = pdinv(K + np.diag(1./tau_tilde)) diff --git a/GPy/inference/latent_function_inference/svgp.py b/GPy/inference/latent_function_inference/svgp.py index 1974991b..d4797311 100644 --- a/GPy/inference/latent_function_inference/svgp.py +++ b/GPy/inference/latent_function_inference/svgp.py @@ -47,6 +47,8 @@ class SVGP(LatentFunctionInference): #rescale the F term if working on a batch F, dF_dmu, dF_dv = F*batch_scale, dF_dmu*batch_scale, dF_dv*batch_scale + if dF_dthetaL is not None: + dF_dthetaL = dF_dthetaL.sum(1).sum(1)*batch_scale #derivatives of expected likelihood Adv = A.T[:,:,None]*dF_dv[None,:,:] # As if dF_Dv is diagonal diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py index 4c72a254..8059f68f 100644 --- a/GPy/kern/_src/add.py +++ b/GPy/kern/_src/add.py @@ -180,9 +180,12 @@ class Add(CombinationKernel): def input_sensitivity(self, summarize=True): if summarize: - return reduce(np.add, [k.input_sensitivity(summarize) for k in self.parts]) + i_s = np.zeros((self.input_dim)) + for k in self.parts: + i_s[k.active_dims] += k.input_sensitivity(summarize) + return i_s else: i_s = np.zeros((len(self.parts), self.input_dim)) from operator import setitem - [setitem(i_s, (i, Ellipsis), k.input_sensitivity(summarize)) for i, k in enumerate(self.parts)] + [setitem(i_s, (i, k.active_dims), k.input_sensitivity(summarize)) for i, k in enumerate(self.parts)] return i_s diff --git a/GPy/likelihoods/bernoulli.py b/GPy/likelihoods/bernoulli.py index 6277c1dc..c398b3a4 100644 --- a/GPy/likelihoods/bernoulli.py +++ b/GPy/likelihoods/bernoulli.py @@ -77,7 +77,7 @@ class Bernoulli(Likelihood): return Z_hat, mu_hat, sigma2_hat - def variational_expectations(self, Y, m, v, gh_points=None): + def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None): if isinstance(self.gp_link, link_functions.Probit): if gh_points is None: diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py index 021ec269..f46339d1 100644 --- a/GPy/likelihoods/gaussian.py +++ b/GPy/likelihoods/gaussian.py @@ -310,18 +310,17 @@ class Gaussian(Likelihood): Ysim = np.array([np.random.normal(self.gp_link.transf(gpj), scale=np.sqrt(self.variance), size=1) for gpj in gp]) return Ysim.reshape(orig_shape) - def log_predictive_density(self, y_test, mu_star, var_star): + def log_predictive_density(self, y_test, mu_star, var_star, Y_metadata=None): """ assumes independence """ v = var_star + self.variance return -0.5*np.log(2*np.pi) -0.5*np.log(v) - 0.5*np.square(y_test - mu_star)/v - def variational_expectations(self, Y, m, v, gh_points=None): + def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None): lik_var = 
float(self.variance) F = -0.5*np.log(2*np.pi) -0.5*np.log(lik_var) - 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/lik_var dF_dmu = (Y - m)/lik_var dF_dv = np.ones_like(v)*(-0.5/lik_var) - dF_dlik_var = np.sum(-0.5/lik_var + 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/(lik_var**2)) - dF_dtheta = [dF_dlik_var] - return F, dF_dmu, dF_dv, dF_dtheta + dF_dtheta = -0.5/lik_var + 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/(lik_var**2) + return F, dF_dmu, dF_dv, dF_dtheta.reshape(1, Y.shape[0], Y.shape[1]) diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py index ee2f5368..a545d54e 100644 --- a/GPy/likelihoods/likelihood.py +++ b/GPy/likelihoods/likelihood.py @@ -178,7 +178,12 @@ class Likelihood(Parameterized): if np.any(np.isnan(dF_dm)) or np.any(np.isinf(dF_dm)): stop - dF_dtheta = None # Not yet implemented + if self.size: + dF_dtheta = self.dlogpdf_dtheta(X, Y[:,None]) # Ntheta x (orig size) x N_{quad_points} + dF_dtheta = np.dot(dF_dtheta, gh_w) + dF_dtheta = dF_dtheta.reshape(self.size, shape[0], shape[1]) + else: + dF_dtheta = None # Not yet implemented return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape), dF_dtheta def predictive_mean(self, mu, variance, Y_metadata=None): diff --git a/GPy/likelihoods/student_t.py b/GPy/likelihoods/student_t.py index f16a55e9..674972bf 100644 --- a/GPy/likelihoods/student_t.py +++ b/GPy/likelihoods/student_t.py @@ -35,8 +35,8 @@ class StudentT(Likelihood): self.log_concave = False - def parameters_changed(self): - self.variance = (self.v / float(self.v - 2)) * self.sigma2 + #def parameters_changed(self): + #self.variance = (self.v / float(self.v - 2)) * self.sigma2 def update_gradients(self, grads): """ @@ -180,7 +180,8 @@ class StudentT(Likelihood): :rtype: float """ e = y - inv_link_f - dlogpdf_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2)) + e2 = np.square(e) + dlogpdf_dvar = self.v*(e2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e2)) return dlogpdf_dvar def dlogpdf_dlink_dvar(self, inv_link_f, y, Y_metadata=None): diff --git a/GPy/mappings/additive.py b/GPy/mappings/additive.py index 5297982b..4e7c545d 100644 --- a/GPy/mappings/additive.py +++ b/GPy/mappings/additive.py @@ -17,45 +17,25 @@ class Additive(Mapping): :type mapping1: GPy.mappings.Mapping :param mapping2: second mapping to add together. 
:type mapping2: GPy.mappings.Mapping - :param tensor: whether or not to use the tensor product of input spaces - :type tensor: bool """ - def __init__(self, mapping1, mapping2, tensor=False): - if tensor: - input_dim = mapping1.input_dim + mapping2.input_dim - else: - input_dim = mapping1.input_dim - assert(mapping1.input_dim==mapping2.input_dim) + def __init__(self, mapping1, mapping2): + assert(mapping1.input_dim==mapping2.input_dim) assert(mapping1.output_dim==mapping2.output_dim) - output_dim = mapping1.output_dim + input_dim, output_dim = mapping1.input_dim, mapping1.output_dim Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim) self.mapping1 = mapping1 self.mapping2 = mapping2 self.num_params = self.mapping1.num_params + self.mapping2.num_params self.name = self.mapping1.name + '+' + self.mapping2.name - def _get_param_names(self): - return self.mapping1._get_param_names + self.mapping2._get_param_names - - def _get_params(self): - return np.hstack((self.mapping1._get_params(), self.mapping2._get_params())) - - def _set_params(self, x): - self.mapping1._set_params(x[:self.mapping1.num_params]) - self.mapping2._set_params(x[self.mapping1.num_params:]) - - def randomize(self): - self.mapping1._randomize() - self.mapping2._randomize() def f(self, X): return self.mapping1.f(X) + self.mapping2.f(X) - def df_dtheta(self, dL_df, X): - self._df_dA = (dL_df[:, :, None]*self.kern.K(X, self.X)[:, None, :]).sum(0).T - self._df_dbias = (dL_df.sum(0)) - return np.hstack((self._df_dA.flatten(), self._df_dbias)) + def update_gradients(self, dL_dF, X): + self.mapping1.update_gradients(dL_dF, X) + self.mapping2.update_gradients(dL_dF, X) - def df_dX(self, dL_df, X): - return self.kern.dK_dX((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X) + def gradients_X(self, dL_dF, X): + return self.mapping1.gradients_X(dL_dF, X) + self.mapping2.gradients_X(dL_dF, X) diff --git a/GPy/mappings/identity.py b/GPy/mappings/identity.py new file mode 100644 index 00000000..b15e476c --- /dev/null +++ b/GPy/mappings/identity.py @@ -0,0 +1,26 @@ +# Copyright (c) 2015, James Hensman + +from ..core.mapping import Mapping +from ..core import Param + +class Identity(Mapping): + """ + A mapping that does nothing! + """ + def __init__(self, input_dim, output_dim, name='identity'): + Mapping.__init__(self, input_dim, output_dim, name) + + def f(self, X): + return X + + def update_gradients(self, dL_dF, X): + pass + + def gradients_X(self, dL_dF, X): + return dL_dF + + + + + + diff --git a/GPy/mappings/kernel.py b/GPy/mappings/kernel.py index 74fa344f..3bfcd388 100644 --- a/GPy/mappings/kernel.py +++ b/GPy/mappings/kernel.py @@ -1,9 +1,10 @@ # Copyright (c) 2013, GPy authors (see AUTHORS.txt). +# Copyright (c) 2015, James Hensman # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np from ..core.mapping import Mapping -import GPy +from ..core import Param class Kernel(Mapping): """ @@ -11,50 +12,41 @@ class Kernel(Mapping): .. math:: - f(\mathbf{x}*) = \mathbf{A}\mathbf{k}(\mathbf{X}, \mathbf{x}^*) + \mathbf{b} + f(\mathbf{x}) = \sum_i \alpha_i k(\mathbf{z}_i, \mathbf{x}) - :param X: input observations containing :math:`\mathbf{X}` - :type X: ndarray + or for multple outputs + + .. math:: + + f_i(\mathbf{x}) = \sum_j \alpha_{i,j} k(\mathbf{z}_i, \mathbf{x}) + + + :param input_dim: dimension of input. + :type input_dim: int :param output_dim: dimension of output. 
:type output_dim: int + :param Z: input observations containing :math:`\mathbf{Z}` + :type Z: ndarray :param kernel: a GPy kernel, defaults to GPy.kern.RBF :type kernel: GPy.kern.kern """ - def __init__(self, X, output_dim=1, kernel=None): - Mapping.__init__(self, input_dim=X.shape[1], output_dim=output_dim) - if kernel is None: - kernel = GPy.kern.RBF(self.input_dim) + def __init__(self, input_dim, output_dim, Z, kernel, name='kernmap'): + Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name) self.kern = kernel - self.X = X - self.num_data = X.shape[0] - self.num_params = self.output_dim*(self.num_data + 1) - self.A = np.array((self.num_data, self.output_dim)) - self.bias = np.array(self.output_dim) - self.randomize() - self.name = 'kernel' - def _get_param_names(self): - return sum([['A_%i_%i' % (n, d) for d in range(self.output_dim)] for n in range(self.num_data)], []) + ['bias_%i' % d for d in range(self.output_dim)] - - def _get_params(self): - return np.hstack((self.A.flatten(), self.bias)) - - def _set_params(self, x): - self.A = x[:self.num_data * self.output_dim].reshape(self.num_data, self.output_dim).copy() - self.bias = x[self.num_data*self.output_dim:].copy() - - def randomize(self): - self.A = np.random.randn(self.num_data, self.output_dim)/np.sqrt(self.num_data+1) - self.bias = np.random.randn(self.output_dim)/np.sqrt(self.num_data+1) + self.Z = Z + self.num_bases, Zdim = X.shape + assert Zdim == self.input_dim + self.A = GPy.core.Param('A', np.random.randn(self.num_bases, self.output_dim)) + self.add_parameter(self.A) def f(self, X): - return np.dot(self.kern.K(X, self.X),self.A) + self.bias + return np.dot(self.kern.K(X, self.Z), self.A) - def df_dtheta(self, dL_df, X): - self._df_dA = (dL_df[:, :, None]*self.kern.K(X, self.X)[:, None, :]).sum(0).T - self._df_dbias = (dL_df.sum(0)) - return np.hstack((self._df_dA.flatten(), self._df_dbias)) + def update_gradients(self, dL_dF, X): + self.kern.update_gradients_full(np.dot(dL_dF, self.A.T)) + self.A.gradient = np.dot( self.kern.K(self.Z, X), dL_dF) - def df_dX(self, dL_df, X): - return self.kern.gradients_X((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X) + def gradients_X(self, dL_dF, X): + return self.kern.gradients_X(np.dot(dL_dF, self.A.T), X, self.Z) diff --git a/GPy/mappings/linear.py b/GPy/mappings/linear.py index 315dfc0e..6fc91944 100644 --- a/GPy/mappings/linear.py +++ b/GPy/mappings/linear.py @@ -1,43 +1,39 @@ # Copyright (c) 2013, 2014 GPy authors (see AUTHORS.txt). +# Copyright (c) 2015, James Hensman # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -from ..core.mapping import Bijective_mapping +from ..core.mapping import Mapping from ..core.parameterization import Param -class Linear(Bijective_mapping): +class Linear(Mapping): """ - Mapping based on a linear model. + A Linear mapping. .. math:: - f(\mathbf{x}*) = \mathbf{W}\mathbf{x}^* + \mathbf{b} + F(\mathbf{x}) = \mathbf{A} \mathbf{x}) - :param X: input observations - :type X: ndarray + + :param input_dim: dimension of input. + :type input_dim: int :param output_dim: dimension of output. 
:type output_dim: int + :param kernel: a GPy kernel, defaults to GPy.kern.RBF + :type kernel: GPy.kern.kern """ - def __init__(self, input_dim=1, output_dim=1, name='linear'): - Bijective_mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name) - self.W = Param('W',np.array((self.input_dim, self.output_dim))) - self.bias = Param('bias',np.array(self.output_dim)) - self.link_parameters(self.W, self.bias) + def __init__(self, input_dim, output_dim, name='linmap'): + Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name) + self.A = GPy.core.Param('A', np.random.randn(self.input_dim, self.output_dim)) + self.add_parameter(self.A) def f(self, X): - return np.dot(X,self.W) + self.bias + return np.dot(X, self.A) - def g(self, f): - V = np.linalg.solve(np.dot(self.W.T, self.W), W.T) - return np.dot(f-self.bias, V) + def update_gradients(self, dL_dF, X): + self.A.gradient = np.dot( X.T, dL_dF) - def df_dtheta(self, dL_df, X): - df_dW = (dL_df[:, :, None]*X[:, None, :]).sum(0).T - df_dbias = (dL_df.sum(0)) - return np.hstack((df_dW.flatten(), df_dbias)) - - def dL_dX(self, partial, X): - """The gradient of L with respect to the inputs to the mapping, where L is a function that is dependent on the output of the mapping, f.""" - return (partial[:, None, :]*self.W[None, :, :]).sum(2) + def gradients_X(self, dL_dF, X): + return np.dot(dL_dF, self.A.T) diff --git a/GPy/mappings/mlp.py b/GPy/mappings/mlp.py index 46dbc2a9..f22fc07f 100644 --- a/GPy/mappings/mlp.py +++ b/GPy/mappings/mlp.py @@ -3,128 +3,40 @@ import numpy as np from ..core.mapping import Mapping +from ..core import Param class MLP(Mapping): """ - Mapping based on a multi-layer perceptron neural network model. - - .. math:: - - f(\\mathbf{x}*) = \\mathbf{W}^0\\boldsymbol{\\phi}(\\mathbf{W}^1\\mathbf{x}+\\mathbf{b}^1)^* + \\mathbf{b}^0 - - where - - .. math:: - - \\phi(\\cdot) = \\text{tanh}(\\cdot) - - :param X: input observations - :type X: ndarray - :param output_dim: dimension of output. - :type output_dim: int - :param hidden_dim: dimension of hidden layer. If it is an int, there is one hidden layer of the given dimension. If it is a list of ints there are as manny hidden layers as the length of the list, each with the given number of hidden nodes in it. - :type hidden_dim: int or list of ints. 
- + Mapping based on a multi-layer perceptron neural network model, with a single hidden layer """ - def __init__(self, input_dim=1, output_dim=1, hidden_dim=3): - Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim) - self.name = 'mlp' - if isinstance(hidden_dim, int): - hidden_dim = [hidden_dim] + def __init__(self, input_dim=1, output_dim=1, hidden_dim=3, name='mlpmap'): + super(MLP).__init__(self, input_dim=input_dim, output_dim=output_dim, name=name) self.hidden_dim = hidden_dim - self.activation = [None]*len(self.hidden_dim) - self.W = [] - self._dL_dW = [] - self.bias = [] - self._dL_dbias = [] - self.W.append(np.zeros((self.input_dim, self.hidden_dim[0]))) - self._dL_dW.append(np.zeros((self.input_dim, self.hidden_dim[0]))) - self.bias.append(np.zeros(self.hidden_dim[0])) - self._dL_dbias.append(np.zeros(self.hidden_dim[0])) - self.num_params = self.hidden_dim[0]*(self.input_dim+1) - for h1, h0 in zip(hidden_dim[1:], hidden_dim[0:-1]): - self.W.append(np.zeros((h0, h1))) - self._dL_dW.append(np.zeros((h0, h1))) - self.bias.append(np.zeros(h1)) - self._dL_dbias.append(np.zeros(h1)) - self.num_params += h1*(h0+1) - self.W.append(np.zeros((self.hidden_dim[-1], self.output_dim))) - self._dL_dW.append(np.zeros((self.hidden_dim[-1], self.output_dim))) - self.bias.append(np.zeros(self.output_dim)) - self._dL_dbias.append(np.zeros(self.output_dim)) - self.num_params += self.output_dim*(self.hidden_dim[-1]+1) - self.randomize() + self.W1 = Param('W1', np.random.randn(self.input_dim, self.hidden_dim)) + self.b1 = Param('b1', np.random.randn(self.hidden_dim)) + self.W2 = Param('W2', np.random.randn(self.hidden_dim, self.output_dim)) + self.b2 = Param('b2', np.random.randn(self.output_dim)) - def _get_param_names(self): - return sum([['W%i_%i_%i' % (i, n, d) for n in range(self.W[i].shape[0]) for d in range(self.W[i].shape[1])] + ['bias%i_%i' % (i, d) for d in range(self.W[i].shape[1])] for i in range(len(self.W))], []) - - def _get_params(self): - param = np.array([]) - for W, bias in zip(self.W, self.bias): - param = np.hstack((param, W.flatten(), bias)) - return param - - def _set_params(self, x): - start = 0 - for W, bias in zip(self.W, self.bias): - end = W.shape[0]*W.shape[1]+start - W[:] = x[start:end].reshape(W.shape[0], W.shape[1]).copy() - start = end - end = W.shape[1]+end - bias[:] = x[start:end].copy() - start = end - - def randomize(self): - for W, bias in zip(self.W, self.bias): - W[:] = np.random.randn(W.shape[0], W.shape[1])/np.sqrt(W.shape[0]+1) - bias[:] = np.random.randn(W.shape[1])/np.sqrt(W.shape[0]+1) def f(self, X): - self._f_computations(X) - return np.dot(np.tanh(self.activation[-1]), self.W[-1]) + self.bias[-1] + N, D = X.shape + activations = np.tanh(np.dot(X,self.W1) + self.b1) + self.out = np.dot(self.activations,self.W2) + self.b2 + return self.output_fn(self.out) - def _f_computations(self, X): - W = self.W[0] - bias = self.bias[0] - self.activation[0] = np.dot(X,W) + bias - for W, bias, index in zip(self.W[1:-1], self.bias[1:-1], range(1, len(self.activation))): - self.activation[index] = np.dot(np.tanh(self.activation[index-1]), W)+bias + def update_gradients(self, dL_dF, X): + activations = np.tanh(np.dot(X,self.W1) + self.b1) - def df_dtheta(self, dL_df, X): - self._df_computations(dL_df, X) - g = np.array([]) - for gW, gbias in zip(self._dL_dW, self._dL_dbias): - g = np.hstack((g, gW.flatten(), gbias)) - return g - def _df_computations(self, dL_df, X): - self._f_computations(X) - a0 = self.activation[-1] - W = self.W[-1] - self._dL_dW[-1] = 
(dL_df[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T - dL_dta=(dL_df[:, None, :]*W[None, :, :]).sum(2) - self._dL_dbias[-1] = (dL_df.sum(0)) - for dL_dW, dL_dbias, W, bias, a0, a1 in zip(self._dL_dW[-2:0:-1], - self._dL_dbias[-2:0:-1], - self.W[-2:0:-1], - self.bias[-2:0:-1], - self.activation[-2::-1], - self.activation[-1:0:-1]): - ta = np.tanh(a1) - dL_da = dL_dta*(1-ta*ta) - dL_dW[:] = (dL_da[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T - dL_dbias[:] = (dL_da.sum(0)) - dL_dta = (dL_da[:, None, :]*W[None, :, :]).sum(2) - ta = np.tanh(self.activation[0]) - dL_da = dL_dta*(1-ta*ta) - W = self.W[0] - self._dL_dW[0] = (dL_da[:, :, None]*X[:, None, :]).sum(0).T - self._dL_dbias[0] = (dL_da.sum(0)) - self._dL_dX = (dL_da[:, None, :]*W[None, :, :]).sum(2) + #Evaluate second-layer gradients. + self.W2.gradient = np.dot(activations.T, dL_dF) + self.b2.gradient = np.sum(dL_dF, 0) + + # Backpropagation to hidden layer. + delta_hid = np.dot(dL_dF, self.W2.T) * (1.0 - activations**2) + + # Finally, evaluate the first-layer gradients. + self.W1.gradients = np.dot(X.T,delta_hid) + self.b1.gradients = np.sum(delta_hid, 0) - - def df_dX(self, dL_df, X): - self._df_computations(dL_df, X) - return self._dL_dX - diff --git a/GPy/mappings/piecewise_linear.py b/GPy/mappings/piecewise_linear.py new file mode 100644 index 00000000..8bdee81e --- /dev/null +++ b/GPy/mappings/piecewise_linear.py @@ -0,0 +1,94 @@ +from GPy.core.mapping import Mapping +from GPy.core import Param +import numpy as np + +class PiecewiseLinear(Mapping): + """ + A piecewise-linear mapping. + + The parameters of this mapping are the positions and values of the function where it is broken (self.breaks, self.values). + + Outside the range of the breaks, the function is assumed to have gradient 1 + """ + def __init__(self, input_dim, output_dim, values, breaks, name='piecewise_linear'): + + assert input_dim==1 + assert output_dim==1 + + Mapping.__init__(self, input_dim, output_dim, name) + + values, breaks = np.array(values).flatten(), np.array(breaks).flatten() + assert values.size == breaks.size + self.values = Param('values', values) + self.breaks = Param('breaks', breaks) + self.link_parameter(self.values) + self.link_parameter(self.breaks) + + def parameters_changed(self): + self.order = np.argsort(self.breaks)*1 + self.reverse_order = np.zeros_like(self.order) + self.reverse_order[self.order] = np.arange(self.order.size) + + self.sorted_breaks = self.breaks[self.order] + self.sorted_values = self.values[self.order] + + self.grads = np.diff(self.sorted_values)/np.diff(self.sorted_breaks) + + def f(self, X): + x = X.flatten() + y = x.copy() + + #first adjus the points below the first value + y[xself.sorted_breaks[-1]] = x[x>self.sorted_breaks[-1]] + self.sorted_values[-1] - self.sorted_breaks[-1] + + #loop throught the pairs of points + for low, up, g, v in zip(self. sorted_breaks[:-1], self.sorted_breaks[1:], self.grads, self.sorted_values[:-1]): + i = np.logical_and(x>low, xlow, xself.sorted_breaks[-1]]) + dL_dv[0] += np.sum(dL_dF[xself.sorted_breaks[-1]]) + + #now put the gradients back in the correct order! + self.breaks.gradient = dL_db[self.reverse_order] + self.values.gradient = dL_dv[self.reverse_order] + + def gradients_X(self, dL_dF, X): + x = X.flatten() + + #outside the range of the breakpoints, the function is just offset by a contant, so the partial derivative is 1. + dL_dX = dL_dF.copy().flatten() + + #insude the breakpoints, the partial derivative is self.grads + for low, up, g, v in zip(self. 
sorted_breaks[:-1], self.sorted_breaks[1:], self.grads, self.sorted_values[:-1]): + i = np.logical_and(x>low, x<up) + dL_dX[i] = dL_dF.flatten()[i]*g + + return dL_dX.reshape(X.shape) 0, 1,0) + lik = GPy.likelihoods.Bernoulli() + k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6) + self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k) + def test_grad(self): + assert self.m.checkgrad(step=1e-4)
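
Note on the new setters: the trigger_update flag added to set_XY / set_X / set_Y (and the new SparseGP.set_Z) lets several data changes be applied before inference is re-run, instead of recomputing the posterior after every call. A minimal usage sketch, assuming a SparseGPRegression model on synthetic data (the data and model construction below are illustrative, not part of this diff):

import numpy as np
import GPy

# illustrative data and model -- not from this changeset
X = np.random.randn(100, 1)
Y = np.sin(X) + 0.1 * np.random.randn(100, 1)
Z = np.linspace(-2.0, 2.0, 10)[:, None]
m = GPy.models.SparseGPRegression(X, Y, Z=Z)

# swap in new data and new inducing inputs:
# trigger_update=False defers inference on the first call;
# set_Z (default trigger_update=True) then recomputes once.
X2 = np.random.randn(100, 1)
Y2 = np.sin(X2) + 0.1 * np.random.randn(100, 1)
m.set_XY(X2, Y2, trigger_update=False)
m.set_Z(np.linspace(-3.0, 3.0, 10)[:, None])
print(m.log_likelihood())

This mirrors how the setters are guarded internally: with trigger_update=False none of update_model(False), update_model(True) or _trigger_params_changed() is called, so the caller decides when the (potentially expensive) posterior update actually happens.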