diff --git a/GPy/mappings/__init__.py b/GPy/mappings/__init__.py index 73390b87..795352af 100644 --- a/GPy/mappings/__init__.py +++ b/GPy/mappings/__init__.py @@ -4,6 +4,7 @@ from .kernel import Kernel from .linear import Linear from .mlp import MLP +from .mlpext import MLPext from .additive import Additive from .compound import Compound from .constant import Constant diff --git a/GPy/mappings/mlpext.py b/GPy/mappings/mlpext.py new file mode 100644 index 00000000..c76ddf3e --- /dev/null +++ b/GPy/mappings/mlpext.py @@ -0,0 +1,132 @@ +# Copyright (c) 2017, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +from ..core.mapping import Mapping +from ..core import Param + +class MLPext(Mapping): + """ + Mapping based on a multi-layer perceptron neural network model, with multiple hidden layers. Activation function + is applied to all hidden layers. The output is a linear combination of the last layer features, i.e. the + last layer is linear. + """ + + def __init__(self, input_dim=1, output_dim=1, hidden_dims=[3], prior=None, activation='tanh', name='mlpmap'): + + """ + :param input_dim: number of input dimensions + :param output_dim: number of output dimensions + :param hidden_dims: list of hidden sizes of hidden layers + :param prior: variance of Gaussian prior on all variables. If None, no prior is used (default: None) + :param activation: choose activation function. Allowed values are 'tanh' and 'sigmoid' + :param name: + """ + super(MLPext, self).__init__(input_dim=input_dim, output_dim=output_dim, name=name) + assert activation in ['tanh', 'sigmoid', 'relu'], NotImplementedError('Only tanh, relu and sigmoid activations' + 'are implemented') + self.hidden_dims = hidden_dims + self.W_list = list() + self.b_list = list() + for i in np.arange(len(hidden_dims) + 1): + in_dim = input_dim if i == 0 else hidden_dims[i - 1] + out_dim = output_dim if i == len(hidden_dims) else hidden_dims[i] + self.W_list.append(Param('W%d'%i, np.random.randn(in_dim, out_dim))) + self.b_list.append(Param('b%d'%i, np.random.randn(out_dim))) + + if prior is not None: + for W, b in zip(self.W_list, self.b_list): + W.set_prior(Gaussian(0, prior)) + b.set_prior(Gaussian(0, prior)) + + self.link_parameters(*self.W_list) + self.link_parameters(*self.b_list) + + if activation == 'tanh': + self.act = np.tanh + self.grad_act = lambda x: 1. / np.square(np.cosh(x)) + + elif activation == 'sigmoid': + from scipy.special import expit + from scipy.stats import logistic + self.act = expit + self.grad_act = logistic._pdf + + elif activation == 'relu': + self.act = lambda x: x * (x > 0) + self.grad_act = lambda x: 1. * (x > 0) + + def f(self, X): + net = X + for W, b, i in zip(self.W_list, self.b_list, np.arange(len(self.W_list))): + net = np.dot(net, W) + net = net + b + if i < len(self.W_list)-1: + # Don't apply nonlinearity to last layer outputs + net = self.act(net) + return net + + def _f_preactivations(self, X): + """Computes the network preactivations, i.e. the results of all intermediate linear layers before applying the + activation function on them + :param X: input data + :return: list of preactivations [X, XW+b, f(XW+b)W+b, ...] + """ + + preactivations_list = list() + net = X + preactivations_list.append(X) + + for W, b, i in zip(self.W_list, self.b_list, np.arange(len(self.W_list))): + net = np.dot(net, W) + net = net + b + if i < len(self.W_list) - 1: + preactivations_list.append(net) + net = self.act(net) + return preactivations_list + + def update_gradients(self, dL_dF, X): + preactivations_list = self._f_preactivations(X) + d_dact = dL_dF + d_dlayer = d_dact + for W, b, preactivation, i in zip(reversed(self.W_list), reversed(self.b_list), reversed(preactivations_list), + reversed(np.arange(len(self.W_list)))): + if i > 0: + # Apply activation function to linear preactivations to get input from previous layer + # (except for first layer where input is X) + activation = self.act(preactivation) + else: + activation = preactivation + W.gradient = np.dot(activation.T, d_dlayer) + b.gradient = np.sum(d_dlayer, 0) + + if i > 0: + # Don't need this computation if we are at the bottom layer + d_dact = np.dot(d_dlayer, W.T) + # d_dlayer = d_dact / np.square(np.cosh(preactivation)) + d_dlayer = d_dact * self.grad_act(preactivation) + + def fix_parameters(self): + """Helper function that fixes all parameters""" + for W, b in zip(self.W_list, self.b_list): + W.fix() + b.fix() + + def unfix_parameters(self): + """Helper function that unfixes all parameters""" + for W, b in zip(self.W_list, self.b_list): + W.unfix() + b.unfix() + + def gradients_X(self, dL_dF, X): + preactivations_list = self._f_preactivations(X) + d_dact = dL_dF + d_dlayer = d_dact + for W, preactivation, i in zip(reversed(self.W_list), reversed(preactivations_list), + reversed(np.arange(len(self.W_list)))): + + # Backpropagation through hidden layer. + d_dact = np.dot(d_dlayer, W.T) + d_dlayer = d_dact * self.grad_act(preactivation) + + return d_dact diff --git a/GPy/testing/mapping_tests.py b/GPy/testing/mapping_tests.py index 2ff0e2d8..d07561ab 100644 --- a/GPy/testing/mapping_tests.py +++ b/GPy/testing/mapping_tests.py @@ -44,6 +44,13 @@ class MappingTests(unittest.TestCase): X = np.random.randn(100,3) self.assertTrue(MappingGradChecker(mapping, X).checkgrad()) + def test_mlpextmapping(self): + np.random.seed(42) + X = np.random.randn(100,3) + for activation in ['tanh', 'relu', 'sigmoid']: + mapping = GPy.mappings.MLPext(input_dim=3, hidden_dims=[5,5], output_dim=2, activation=activation) + self.assertTrue(MappingGradChecker(mapping, X).checkgrad()) + def test_addmapping(self): m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2) m2 = GPy.mappings.Linear(input_dim=3, output_dim=2)