lots of changes to mappings

This commit is contained in:
James Hensman 2015-03-23 14:24:32 +00:00
parent 99545500b1
commit 9976e56bea
7 changed files with 202 additions and 238 deletions

View file

@ -1,4 +1,5 @@
# Copyright (c) 2013,2014, GPy authors (see AUTHORS.txt). # Copyright (c) 2013,2014, GPy authors (see AUTHORS.txt).
# Copyright (c) 2015, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import sys import sys
@ -7,7 +8,7 @@ import numpy as np
class Mapping(Parameterized): class Mapping(Parameterized):
""" """
Base model for shared behavior between models that can act like a mapping. Base model for shared mapping behaviours
""" """
def __init__(self, input_dim, output_dim, name='mapping'): def __init__(self, input_dim, output_dim, name='mapping'):
@ -18,49 +19,12 @@ class Mapping(Parameterized):
def f(self, X): def f(self, X):
raise NotImplementedError raise NotImplementedError
def df_dX(self, dL_df, X): def gradients_X(self, dL_dF, X):
"""Evaluate derivatives of mapping outputs with respect to inputs.
:param dL_df: gradient of the objective with respect to the function.
:type dL_df: ndarray (num_data x output_dim)
:param X: the input locations where derivatives are to be evaluated.
:type X: ndarray (num_data x input_dim)
:returns: matrix containing gradients of the function with respect to the inputs.
"""
raise NotImplementedError raise NotImplementedError
def df_dtheta(self, dL_df, X): def update_gradients(self, dL_dF, X):
"""The gradient of the outputs of the mapping with respect to each of the parameters.
:param dL_df: gradient of the objective with respect to the function.
:type dL_df: ndarray (num_data x output_dim)
:param X: input locations where the function is evaluated.
:type X: ndarray (num_data x input_dim)
:returns: Matrix containing gradients with respect to parameters of each output for each input data.
:rtype: ndarray (num_params length)
"""
raise NotImplementedError raise NotImplementedError
def plot(self, *args):
"""
Plots the mapping associated with the model.
- In one dimension, the function is plotted.
- In two dimensions, a contour-plot shows the function
- In higher dimensions, we've not implemented this yet !TODO!
Can plot only part of the data and part of the posterior functions
using which_data and which_functions
This is a convenience function: arguments are passed to
GPy.plotting.matplot_dep.models_plots.plot_mapping
"""
if "matplotlib" in sys.modules:
from ..plotting.matplot_dep import models_plots
mapping_plots.plot_mapping(self,*args)
else:
raise NameError, "matplotlib package has not been imported."
class Bijective_mapping(Mapping): class Bijective_mapping(Mapping):
""" """

View file

@ -17,45 +17,25 @@ class Additive(Mapping):
:type mapping1: GPy.mappings.Mapping :type mapping1: GPy.mappings.Mapping
:param mapping2: second mapping to add together. :param mapping2: second mapping to add together.
:type mapping2: GPy.mappings.Mapping :type mapping2: GPy.mappings.Mapping
:param tensor: whether or not to use the tensor product of input spaces
:type tensor: bool
""" """
def __init__(self, mapping1, mapping2, tensor=False): def __init__(self, mapping1, mapping2):
if tensor: assert(mapping1.input_dim==mapping2.input_dim)
input_dim = mapping1.input_dim + mapping2.input_dim
else:
input_dim = mapping1.input_dim
assert(mapping1.input_dim==mapping2.input_dim)
assert(mapping1.output_dim==mapping2.output_dim) assert(mapping1.output_dim==mapping2.output_dim)
output_dim = mapping1.output_dim input_dim, output_dim = mapping1.input_dim, mapping1.output_dim
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim) Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
self.mapping1 = mapping1 self.mapping1 = mapping1
self.mapping2 = mapping2 self.mapping2 = mapping2
self.num_params = self.mapping1.num_params + self.mapping2.num_params self.num_params = self.mapping1.num_params + self.mapping2.num_params
self.name = self.mapping1.name + '+' + self.mapping2.name self.name = self.mapping1.name + '+' + self.mapping2.name
def _get_param_names(self):
return self.mapping1._get_param_names + self.mapping2._get_param_names
def _get_params(self):
return np.hstack((self.mapping1._get_params(), self.mapping2._get_params()))
def _set_params(self, x):
self.mapping1._set_params(x[:self.mapping1.num_params])
self.mapping2._set_params(x[self.mapping1.num_params:])
def randomize(self):
self.mapping1._randomize()
self.mapping2._randomize()
def f(self, X): def f(self, X):
return self.mapping1.f(X) + self.mapping2.f(X) return self.mapping1.f(X) + self.mapping2.f(X)
def df_dtheta(self, dL_df, X): def update_gradients(self, dL_dF, X):
self._df_dA = (dL_df[:, :, None]*self.kern.K(X, self.X)[:, None, :]).sum(0).T self.mapping1.update_gradients(dL_dF, X)
self._df_dbias = (dL_df.sum(0)) self.mapping2.update_gradients(dL_dF, X)
return np.hstack((self._df_dA.flatten(), self._df_dbias))
def df_dX(self, dL_df, X): def gradients_X(self, dL_dF, X):
return self.kern.dK_dX((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X) return self.mapping1.gradients_X(dL_dF, X) + self.mapping2.gradients_X(dL_dF, X)

26
GPy/mappings/identity.py Normal file
View file

@ -0,0 +1,26 @@
# Copyright (c) 2015, James Hensman
from ..core.mapping import Mapping
from ..core import Param
class Identity(Mapping):
    """
    The identity mapping: the output is simply the input, unchanged.

    It has no parameters, so there is nothing to optimize.
    """
    def __init__(self, input_dim, output_dim, name='identity'):
        super(Identity, self).__init__(input_dim, output_dim, name)

    def f(self, X):
        # F(X) = X
        return X

    def update_gradients(self, dL_dF, X):
        # no parameters, hence no gradients to store
        pass

    def gradients_X(self, dL_dF, X):
        # dF/dX is the identity, so the chain rule returns dL_dF untouched
        return dL_dF

View file

@ -1,9 +1,10 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt). # Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Copyright (c) 2015, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
from ..core.mapping import Mapping from ..core.mapping import Mapping
import GPy from ..core import Param
class Kernel(Mapping): class Kernel(Mapping):
""" """
@ -11,50 +12,41 @@ class Kernel(Mapping):
.. math:: .. math::
f(\mathbf{x}*) = \mathbf{A}\mathbf{k}(\mathbf{X}, \mathbf{x}^*) + \mathbf{b} f(\mathbf{x}) = \sum_i \alpha_i k(\mathbf{z}_i, \mathbf{x})
:param X: input observations containing :math:`\mathbf{X}` or for multple outputs
:type X: ndarray
.. math::
f_i(\mathbf{x}) = \sum_j \alpha_{i,j} k(\mathbf{z}_i, \mathbf{x})
:param input_dim: dimension of input.
:type input_dim: int
:param output_dim: dimension of output. :param output_dim: dimension of output.
:type output_dim: int :type output_dim: int
:param Z: input observations containing :math:`\mathbf{Z}`
:type Z: ndarray
:param kernel: a GPy kernel, defaults to GPy.kern.RBF :param kernel: a GPy kernel, defaults to GPy.kern.RBF
:type kernel: GPy.kern.kern :type kernel: GPy.kern.kern
""" """
def __init__(self, X, output_dim=1, kernel=None): def __init__(self, input_dim, output_dim, Z, kernel, name='kernmap'):
Mapping.__init__(self, input_dim=X.shape[1], output_dim=output_dim) Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
if kernel is None:
kernel = GPy.kern.RBF(self.input_dim)
self.kern = kernel self.kern = kernel
self.X = X self.Z = Z
self.num_data = X.shape[0] self.num_bases, Zdim = X.shape
self.num_params = self.output_dim*(self.num_data + 1) assert Zdim == self.input_dim
self.A = np.array((self.num_data, self.output_dim)) self.A = GPy.core.Param('A', np.random.randn(self.num_bases, self.output_dim))
self.bias = np.array(self.output_dim) self.add_parameter(self.A)
self.randomize()
self.name = 'kernel'
def _get_param_names(self):
return sum([['A_%i_%i' % (n, d) for d in range(self.output_dim)] for n in range(self.num_data)], []) + ['bias_%i' % d for d in range(self.output_dim)]
def _get_params(self):
return np.hstack((self.A.flatten(), self.bias))
def _set_params(self, x):
self.A = x[:self.num_data * self.output_dim].reshape(self.num_data, self.output_dim).copy()
self.bias = x[self.num_data*self.output_dim:].copy()
def randomize(self):
self.A = np.random.randn(self.num_data, self.output_dim)/np.sqrt(self.num_data+1)
self.bias = np.random.randn(self.output_dim)/np.sqrt(self.num_data+1)
def f(self, X): def f(self, X):
return np.dot(self.kern.K(X, self.X),self.A) + self.bias return np.dot(self.kern.K(X, self.Z), self.A)
def df_dtheta(self, dL_df, X): def update_gradients(self, dL_dF, X):
self._df_dA = (dL_df[:, :, None]*self.kern.K(X, self.X)[:, None, :]).sum(0).T self.kern.update_gradients_full(np.dot(dL_dF, self.A.T))
self._df_dbias = (dL_df.sum(0)) self.A.gradient = np.dot( self.kern.K(self.Z, X), dL_dF)
return np.hstack((self._df_dA.flatten(), self._df_dbias))
def df_dX(self, dL_df, X): def gradients_X(self, dL_dF, X):
return self.kern.gradients_X((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X) return self.kern.gradients_X(np.dot(dL_dF, self.A.T), X, self.Z)

View file

@ -1,43 +1,39 @@
# Copyright (c) 2013, 2014 GPy authors (see AUTHORS.txt). # Copyright (c) 2013, 2014 GPy authors (see AUTHORS.txt).
# Copyright (c) 2015, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
from ..core.mapping import Bijective_mapping from ..core.mapping import Mapping
from ..core.parameterization import Param from ..core.parameterization import Param
class Linear(Bijective_mapping): class Linear(Mapping):
""" """
Mapping based on a linear model. A Linear mapping.
.. math:: .. math::
f(\mathbf{x}*) = \mathbf{W}\mathbf{x}^* + \mathbf{b} F(\mathbf{x}) = \mathbf{A} \mathbf{x})
:param X: input observations
:type X: ndarray :param input_dim: dimension of input.
:type input_dim: int
:param output_dim: dimension of output. :param output_dim: dimension of output.
:type output_dim: int :type output_dim: int
:param kernel: a GPy kernel, defaults to GPy.kern.RBF
:type kernel: GPy.kern.kern
""" """
def __init__(self, input_dim=1, output_dim=1, name='linear'): def __init__(self, input_dim, output_dim, name='linmap'):
Bijective_mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name) Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
self.W = Param('W',np.array((self.input_dim, self.output_dim))) self.A = GPy.core.Param('A', np.random.randn(self.input_dim, self.output_dim))
self.bias = Param('bias',np.array(self.output_dim)) self.add_parameter(self.A)
self.link_parameters(self.W, self.bias)
def f(self, X): def f(self, X):
return np.dot(X,self.W) + self.bias return np.dot(X, self.A)
def g(self, f): def update_gradients(self, dL_dF, X):
V = np.linalg.solve(np.dot(self.W.T, self.W), W.T) self.A.gradient = np.dot( X.T dL_dF)
return np.dot(f-self.bias, V)
def df_dtheta(self, dL_df, X): def gradients_X(self, dL_dF, X):
df_dW = (dL_df[:, :, None]*X[:, None, :]).sum(0).T return np.dot(dL_dF, self.A.T)
df_dbias = (dL_df.sum(0))
return np.hstack((df_dW.flatten(), df_dbias))
def dL_dX(self, partial, X):
"""The gradient of L with respect to the inputs to the mapping, where L is a function that is dependent on the output of the mapping, f."""
return (partial[:, None, :]*self.W[None, :, :]).sum(2)

View file

@ -3,128 +3,40 @@
import numpy as np import numpy as np
from ..core.mapping import Mapping from ..core.mapping import Mapping
from ..core import Param
class MLP(Mapping): class MLP(Mapping):
""" """
Mapping based on a multi-layer perceptron neural network model. Mapping based on a multi-layer perceptron neural network model, with a single hidden layer
.. math::
f(\\mathbf{x}*) = \\mathbf{W}^0\\boldsymbol{\\phi}(\\mathbf{W}^1\\mathbf{x}+\\mathbf{b}^1)^* + \\mathbf{b}^0
where
.. math::
\\phi(\\cdot) = \\text{tanh}(\\cdot)
:param X: input observations
:type X: ndarray
:param output_dim: dimension of output.
:type output_dim: int
:param hidden_dim: dimension of hidden layer. If it is an int, there is one hidden layer of the given dimension. If it is a list of ints there are as manny hidden layers as the length of the list, each with the given number of hidden nodes in it.
:type hidden_dim: int or list of ints.
""" """
def __init__(self, input_dim=1, output_dim=1, hidden_dim=3): def __init__(self, input_dim=1, output_dim=1, hidden_dim=3, name='mlpmap'):
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim) super(MLP).__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
self.name = 'mlp'
if isinstance(hidden_dim, int):
hidden_dim = [hidden_dim]
self.hidden_dim = hidden_dim self.hidden_dim = hidden_dim
self.activation = [None]*len(self.hidden_dim) self.W1 = Param('W1', np.random.randn(self.input_dim, self.hidden_dim))
self.W = [] self.b1 = Param('b1', np.random.randn(self.hidden_dim))
self._dL_dW = [] self.W2 = Param('W2', np.random.randn(self.hidden_dim, self.output_dim))
self.bias = [] self.b2 = Param('b2', np.random.randn(self.output_dim))
self._dL_dbias = []
self.W.append(np.zeros((self.input_dim, self.hidden_dim[0])))
self._dL_dW.append(np.zeros((self.input_dim, self.hidden_dim[0])))
self.bias.append(np.zeros(self.hidden_dim[0]))
self._dL_dbias.append(np.zeros(self.hidden_dim[0]))
self.num_params = self.hidden_dim[0]*(self.input_dim+1)
for h1, h0 in zip(hidden_dim[1:], hidden_dim[0:-1]):
self.W.append(np.zeros((h0, h1)))
self._dL_dW.append(np.zeros((h0, h1)))
self.bias.append(np.zeros(h1))
self._dL_dbias.append(np.zeros(h1))
self.num_params += h1*(h0+1)
self.W.append(np.zeros((self.hidden_dim[-1], self.output_dim)))
self._dL_dW.append(np.zeros((self.hidden_dim[-1], self.output_dim)))
self.bias.append(np.zeros(self.output_dim))
self._dL_dbias.append(np.zeros(self.output_dim))
self.num_params += self.output_dim*(self.hidden_dim[-1]+1)
self.randomize()
def _get_param_names(self):
return sum([['W%i_%i_%i' % (i, n, d) for n in range(self.W[i].shape[0]) for d in range(self.W[i].shape[1])] + ['bias%i_%i' % (i, d) for d in range(self.W[i].shape[1])] for i in range(len(self.W))], [])
def _get_params(self):
param = np.array([])
for W, bias in zip(self.W, self.bias):
param = np.hstack((param, W.flatten(), bias))
return param
def _set_params(self, x):
start = 0
for W, bias in zip(self.W, self.bias):
end = W.shape[0]*W.shape[1]+start
W[:] = x[start:end].reshape(W.shape[0], W.shape[1]).copy()
start = end
end = W.shape[1]+end
bias[:] = x[start:end].copy()
start = end
def randomize(self):
for W, bias in zip(self.W, self.bias):
W[:] = np.random.randn(W.shape[0], W.shape[1])/np.sqrt(W.shape[0]+1)
bias[:] = np.random.randn(W.shape[1])/np.sqrt(W.shape[0]+1)
def f(self, X): def f(self, X):
self._f_computations(X) N, D = X.shape
return np.dot(np.tanh(self.activation[-1]), self.W[-1]) + self.bias[-1] activations = np.tanh(np.dot(X,self.W1) + self.b1)
self.out = np.dot(self.activations,self.W2) + self.b2
return self.output_fn(self.out)
def _f_computations(self, X): def update_gradients(self, dL_dF, X):
W = self.W[0] activations = np.tanh(np.dot(X,self.W1) + self.b1)
bias = self.bias[0]
self.activation[0] = np.dot(X,W) + bias
for W, bias, index in zip(self.W[1:-1], self.bias[1:-1], range(1, len(self.activation))):
self.activation[index] = np.dot(np.tanh(self.activation[index-1]), W)+bias
def df_dtheta(self, dL_df, X):
self._df_computations(dL_df, X)
g = np.array([])
for gW, gbias in zip(self._dL_dW, self._dL_dbias):
g = np.hstack((g, gW.flatten(), gbias))
return g
def _df_computations(self, dL_df, X):
self._f_computations(X)
a0 = self.activation[-1]
W = self.W[-1]
self._dL_dW[-1] = (dL_df[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T
dL_dta=(dL_df[:, None, :]*W[None, :, :]).sum(2)
self._dL_dbias[-1] = (dL_df.sum(0))
for dL_dW, dL_dbias, W, bias, a0, a1 in zip(self._dL_dW[-2:0:-1],
self._dL_dbias[-2:0:-1],
self.W[-2:0:-1],
self.bias[-2:0:-1],
self.activation[-2::-1],
self.activation[-1:0:-1]):
ta = np.tanh(a1)
dL_da = dL_dta*(1-ta*ta)
dL_dW[:] = (dL_da[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T
dL_dbias[:] = (dL_da.sum(0))
dL_dta = (dL_da[:, None, :]*W[None, :, :]).sum(2)
ta = np.tanh(self.activation[0])
dL_da = dL_dta*(1-ta*ta)
W = self.W[0]
self._dL_dW[0] = (dL_da[:, :, None]*X[:, None, :]).sum(0).T
self._dL_dbias[0] = (dL_da.sum(0))
self._dL_dX = (dL_da[:, None, :]*W[None, :, :]).sum(2)
def df_dX(self, dL_df, X): #Evaluate second-layer gradients.
self._df_computations(dL_df, X) self.W2.gradient = np.dot(activations.T, dL_dF)
return self._dL_dX self.b2.gradient = np.sum(dL_dF, 0)
# Backpropagation to hidden layer.
delta_hid = np.dot(dL_dF, self.W2.T) * (1.0 - activations**2)
# Finally, evaluate the first-layer gradients.
self.W1.gradients = np.dot(X.T,delta_hid)
self.b1.gradients = np.sum(delta_hid, 0)

View file

@ -0,0 +1,94 @@
from GPy.core.mapping import Mapping
from GPy.core import Param
import numpy as np
class PiecewiseLinear(Mapping):
    """
    A piecewise-linear mapping (1D input, 1D output).

    The parameters of this mapping are the positions and values of the
    function where it is broken (self.breaks, self.values). Outside the
    range of the breaks, the function is assumed to have gradient 1.

    :param input_dim: dimension of input (must be 1)
    :type input_dim: int
    :param output_dim: dimension of output (must be 1)
    :type output_dim: int
    :param values: function values at the breakpoints
    :param breaks: positions of the breakpoints
    """
    def __init__(self, input_dim, output_dim, values, breaks, name='piecewise_linear'):
        assert input_dim == 1
        assert output_dim == 1
        Mapping.__init__(self, input_dim, output_dim, name)
        values, breaks = np.array(values).flatten(), np.array(breaks).flatten()
        assert values.size == breaks.size
        self.values = Param('values', values)
        self.breaks = Param('breaks', breaks)
        self.link_parameter(self.values)
        self.link_parameter(self.breaks)

    def parameters_changed(self):
        # Keep sorted copies of the breakpoints/values, plus the permutation
        # needed to scatter gradients back to the unsorted parameter order.
        self.order = np.argsort(self.breaks) * 1  # *1 copies out of the Param
        self.reverse_order = np.zeros_like(self.order)
        self.reverse_order[self.order] = np.arange(self.order.size)
        self.sorted_breaks = self.breaks[self.order]
        self.sorted_values = self.values[self.order]
        # slope of each linear segment between consecutive breakpoints
        self.grads = np.diff(self.sorted_values) / np.diff(self.sorted_breaks)

    def f(self, X):
        x = X.flatten()
        y = x.copy()
        # below the first break: unit slope, offset so that f(break[0]) = value[0]
        left = x < self.sorted_breaks[0]
        y[left] = x[left] + self.sorted_values[0] - self.sorted_breaks[0]
        # at or past the last break: unit slope again (>= so the last break
        # itself maps onto its value rather than falling through unchanged)
        right = x >= self.sorted_breaks[-1]
        y[right] = x[right] + self.sorted_values[-1] - self.sorted_breaks[-1]
        # interpolate linearly inside each half-open interval [low, up)
        # (>= low so points exactly on an interior break are covered)
        for low, up, g, v in zip(self.sorted_breaks[:-1], self.sorted_breaks[1:], self.grads, self.sorted_values[:-1]):
            i = np.logical_and(x >= low, x < up)
            y[i] = v + (x[i] - low) * g
        return y.reshape(-1, 1)

    def update_gradients(self, dL_dF, X):
        x = X.flatten()
        dL_dF = dL_dF.flatten()
        dL_db = np.zeros(self.sorted_breaks.size)
        dL_dv = np.zeros(self.sorted_values.size)
        # loop across each interval, computing the gradient for each of the
        # 4 parameters (two breaks, two values) that define it
        for i, (low, up, g, v) in enumerate(zip(self.sorted_breaks[:-1], self.sorted_breaks[1:], self.grads, self.sorted_values[:-1])):
            index = np.logical_and(x >= low, x < up)
            xx = x[index]
            grad = dL_dF[index]
            span = up - low
            dL_dv[i] += np.sum(grad * ((low - xx) / span + 1))
            dL_dv[i + 1] += np.sum(grad * (xx - low) / span)
            dL_db[i] += np.sum(grad * g * (xx - up) / span)
            dL_db[i + 1] += np.sum(grad * g * (low - xx) / span)
        # now the end parts: the unit-slope tails shift with the end
        # break/value pair (boundaries match those used in f)
        left = x < self.sorted_breaks[0]
        right = x >= self.sorted_breaks[-1]
        dL_db[0] -= np.sum(dL_dF[left])
        dL_db[-1] -= np.sum(dL_dF[right])
        dL_dv[0] += np.sum(dL_dF[left])
        dL_dv[-1] += np.sum(dL_dF[right])
        # scatter the gradients back into the (unsorted) parameter order
        self.breaks.gradient = dL_db[self.reverse_order]
        self.values.gradient = dL_dv[self.reverse_order]

    def gradients_X(self, dL_dF, X):
        x = X.flatten()
        # flatten so boolean masks over x index dL_dF consistently
        # (dL_dF arrives as an (N, 1) array)
        dL_dF = dL_dF.flatten()
        # outside the range of the breakpoints the function is just offset by
        # a constant, so the partial derivative is 1
        dL_dX = dL_dF.copy()
        # inside each interval the partial derivative is that segment's slope
        for low, up, g, v in zip(self.sorted_breaks[:-1], self.sorted_breaks[1:], self.grads, self.sorted_values[:-1]):
            i = np.logical_and(x >= low, x < up)
            dL_dX[i] = dL_dF[i] * g
        return dL_dX.reshape(-1, 1)