lots of changes to mappings

2026-07-20 16:51:05 +02:00 · 2015-03-23 14:24:32 +00:00 · 2015-03-23 14:24:32 +00:00 · 9976e56bea
commit 9976e56bea
parent 99545500b1
7 changed files with 202 additions and 238 deletions
--- a/GPy/core/mapping.py
+++ b/GPy/core/mapping.py
@ -1,4 +1,5 @@
 # Copyright (c) 2013,2014, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2015, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import sys
@ -7,7 +8,7 @@ import numpy as np

 class Mapping(Parameterized):
    """
-    Base model for shared behavior between models that can act like a mapping.
+    Base model for shared mapping behaviours
    """

    def __init__(self, input_dim, output_dim, name='mapping'):
@ -18,49 +19,12 @@ class Mapping(Parameterized):
    def f(self, X):
        raise NotImplementedError

-    def df_dX(self, dL_df, X):
-        """Evaluate derivatives of mapping outputs with respect to inputs.
-
-        :param dL_df: gradient of the objective with respect to the function.
-        :type dL_df: ndarray (num_data x output_dim)
-        :param X: the input locations where derivatives are to be evaluated.
-        :type X: ndarray (num_data x input_dim)
-        :returns: matrix containing gradients of the function with respect to the inputs.
-        """
+    def gradients_X(self, dL_dF, X):
        raise NotImplementedError

-    def df_dtheta(self, dL_df, X):
-        """The gradient of the outputs of the mapping with respect to each of the parameters.
-
-        :param dL_df: gradient of the objective with respect to the function.
-        :type dL_df: ndarray (num_data x output_dim)
-        :param X: input locations where the function is evaluated.
-        :type X: ndarray (num_data x input_dim)
-        :returns: Matrix containing gradients with respect to parameters of each output for each input data.
-        :rtype: ndarray (num_params length)
-        """
-
+    def update_gradients(self, dL_dF, X):
        raise NotImplementedError

-    def plot(self, *args):
-        """
-        Plots the mapping associated with the model.
-          - In one dimension, the function is plotted.
-          - In two dimensions, a contour-plot shows the function
-          - In higher dimensions, we've not implemented this yet !TODO!
-
-        Can plot only part of the data and part of the posterior functions
-        using which_data and which_functions
-
-        This is a convenience function: arguments are passed to
-        GPy.plotting.matplot_dep.models_plots.plot_mapping
-        """
-
-        if "matplotlib" in sys.modules:
-            from ..plotting.matplot_dep import models_plots
-            mapping_plots.plot_mapping(self,*args)
-        else:
-            raise NameError, "matplotlib package has not been imported."

 class Bijective_mapping(Mapping):
    """
--- a/GPy/mappings/additive.py
+++ b/GPy/mappings/additive.py
@ -17,45 +17,25 @@ class Additive(Mapping):
    :type mapping1: GPy.mappings.Mapping
    :param mapping2: second mapping to add together.
    :type mapping2: GPy.mappings.Mapping
-    :param tensor: whether or not to use the tensor product of input spaces
-    :type tensor: bool

    """

-    def __init__(self, mapping1, mapping2, tensor=False):
-        if tensor:
-            input_dim = mapping1.input_dim + mapping2.input_dim
-        else:
-            input_dim = mapping1.input_dim
-            assert(mapping1.input_dim==mapping2.input_dim)
+    def __init__(self, mapping1, mapping2):
+        assert(mapping1.input_dim==mapping2.input_dim)
        assert(mapping1.output_dim==mapping2.output_dim)
-        output_dim = mapping1.output_dim
+        input_dim, output_dim = mapping1.input_dim, mapping1.output_dim
        Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
        self.mapping1 = mapping1
        self.mapping2 = mapping2
        self.num_params = self.mapping1.num_params + self.mapping2.num_params
        self.name = self.mapping1.name + '+' + self.mapping2.name
-    def _get_param_names(self):
-        return self.mapping1._get_param_names + self.mapping2._get_param_names
-
-    def _get_params(self):
-        return np.hstack((self.mapping1._get_params(), self.mapping2._get_params()))
-
-    def _set_params(self, x):
-        self.mapping1._set_params(x[:self.mapping1.num_params])
-        self.mapping2._set_params(x[self.mapping1.num_params:])
-        
-    def randomize(self):
-        self.mapping1._randomize()
-        self.mapping2._randomize()

    def f(self, X):
        return self.mapping1.f(X) + self.mapping2.f(X)

-    def df_dtheta(self, dL_df, X):
-        self._df_dA = (dL_df[:, :, None]*self.kern.K(X, self.X)[:, None, :]).sum(0).T
-        self._df_dbias = (dL_df.sum(0))
-        return np.hstack((self._df_dA.flatten(), self._df_dbias))
+    def update_gradients(self, dL_dF, X):
+        self.mapping1.update_gradients(dL_dF, X)
+        self.mapping2.update_gradients(dL_dF, X)

-    def df_dX(self, dL_df, X):
-        return self.kern.dK_dX((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X) 
+    def gradients_X(self, dL_dF, X):
+        return self.mapping1.gradients_X(dL_dF, X) + self.mapping2.gradients_X(dL_dF, X)
--- a/GPy/mappings/identity.py
+++ b/GPy/mappings/identity.py
@ -0,0 +1,26 @@
+# Copyright (c) 2015, James Hensman
+
+from ..core.mapping import Mapping
+from ..core import Param
+
+class Identity(Mapping):
+    """
+    A mapping that does nothing!
+    """
+    def __init__(self, input_dim, output_dim, name='identity'):
+        Mapping.__init__(self, input_dim, output_dim, name)
+
+    def f(self, X):
+        return X
+
+    def update_gradients(self, dL_dF, X):
+        pass
+
+    def gradients_X(self, dL_dF, X):
+        return dL_dF
+
+
+
+
+
+
--- a/GPy/mappings/kernel.py
+++ b/GPy/mappings/kernel.py
@ -1,9 +1,10 @@
 # Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2015, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
 from ..core.mapping import Mapping
-import GPy
+from ..core import Param

 class Kernel(Mapping):
    """
@ -11,50 +12,41 @@ class Kernel(Mapping):

    .. math::

-       f(\mathbf{x}*) = \mathbf{A}\mathbf{k}(\mathbf{X}, \mathbf{x}^*) + \mathbf{b}
+       f(\mathbf{x}) = \sum_i \alpha_i k(\mathbf{z}_i, \mathbf{x})

-    :param X: input observations containing :math:`\mathbf{X}`
-    :type X: ndarray
+    or for multple outputs
+
+    .. math::
+
+       f_i(\mathbf{x}) = \sum_j \alpha_{i,j} k(\mathbf{z}_i, \mathbf{x})
+
+
+    :param input_dim: dimension of input.
+    :type input_dim: int
    :param output_dim: dimension of output.
    :type output_dim: int
+    :param Z: input observations containing :math:`\mathbf{Z}`
+    :type Z: ndarray
    :param kernel: a GPy kernel, defaults to GPy.kern.RBF
    :type kernel: GPy.kern.kern

    """

-    def __init__(self, X, output_dim=1, kernel=None):
-        Mapping.__init__(self, input_dim=X.shape[1], output_dim=output_dim)
-        if kernel is None:
-            kernel = GPy.kern.RBF(self.input_dim)
+    def __init__(self, input_dim, output_dim, Z, kernel, name='kernmap'):
+        Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
        self.kern = kernel
-        self.X = X
-        self.num_data = X.shape[0]
-        self.num_params = self.output_dim*(self.num_data + 1)
-        self.A = np.array((self.num_data, self.output_dim))
-        self.bias = np.array(self.output_dim)
-        self.randomize()
-        self.name = 'kernel'
-    def _get_param_names(self):
-        return sum([['A_%i_%i' % (n, d) for d in range(self.output_dim)] for n in range(self.num_data)], []) + ['bias_%i' % d for d in range(self.output_dim)]
-
-    def _get_params(self):
-        return np.hstack((self.A.flatten(), self.bias))
-
-    def _set_params(self, x):
-        self.A = x[:self.num_data * self.output_dim].reshape(self.num_data, self.output_dim).copy()
-        self.bias = x[self.num_data*self.output_dim:].copy()
-
-    def randomize(self):
-        self.A = np.random.randn(self.num_data, self.output_dim)/np.sqrt(self.num_data+1)
-        self.bias = np.random.randn(self.output_dim)/np.sqrt(self.num_data+1)
+        self.Z = Z
+        self.num_bases, Zdim = X.shape
+        assert Zdim == self.input_dim
+        self.A = GPy.core.Param('A', np.random.randn(self.num_bases, self.output_dim))
+        self.add_parameter(self.A)

    def f(self, X):
-        return np.dot(self.kern.K(X, self.X),self.A) + self.bias
+        return np.dot(self.kern.K(X, self.Z), self.A)

-    def df_dtheta(self, dL_df, X):
-        self._df_dA = (dL_df[:, :, None]*self.kern.K(X, self.X)[:, None, :]).sum(0).T
-        self._df_dbias = (dL_df.sum(0))
-        return np.hstack((self._df_dA.flatten(), self._df_dbias))
+    def update_gradients(self, dL_dF, X):
+        self.kern.update_gradients_full(np.dot(dL_dF, self.A.T))
+        self.A.gradient = np.dot( self.kern.K(self.Z, X), dL_dF)

-    def df_dX(self, dL_df, X):
-        return self.kern.gradients_X((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X)
+    def gradients_X(self, dL_dF, X):
+        return self.kern.gradients_X(np.dot(dL_dF, self.A.T), X, self.Z)
--- a/GPy/mappings/linear.py
+++ b/GPy/mappings/linear.py
@ -1,43 +1,39 @@
 # Copyright (c) 2013, 2014 GPy authors (see AUTHORS.txt).
+# Copyright (c) 2015, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-from ..core.mapping import Bijective_mapping
+from ..core.mapping import Mapping
 from ..core.parameterization import Param

-class Linear(Bijective_mapping):
+class Linear(Mapping):
    """
-    Mapping based on a linear model.
+    A Linear mapping.

    .. math::

-       f(\mathbf{x}*) = \mathbf{W}\mathbf{x}^* + \mathbf{b}
+       F(\mathbf{x}) = \mathbf{A} \mathbf{x})

-    :param X: input observations
-    :type X: ndarray
+
+    :param input_dim: dimension of input.
+    :type input_dim: int
    :param output_dim: dimension of output.
    :type output_dim: int
+    :param kernel: a GPy kernel, defaults to GPy.kern.RBF
+    :type kernel: GPy.kern.kern

    """

-    def __init__(self, input_dim=1, output_dim=1, name='linear'):
-        Bijective_mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
-        self.W = Param('W',np.array((self.input_dim, self.output_dim)))
-        self.bias = Param('bias',np.array(self.output_dim))
-        self.link_parameters(self.W, self.bias)
+    def __init__(self, input_dim, output_dim, name='linmap'):
+        Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
+        self.A = GPy.core.Param('A', np.random.randn(self.input_dim, self.output_dim))
+        self.add_parameter(self.A)

    def f(self, X):
-        return np.dot(X,self.W) + self.bias
+        return np.dot(X, self.A)

-    def g(self, f):
-        V = np.linalg.solve(np.dot(self.W.T, self.W), W.T)
-        return np.dot(f-self.bias, V)  
+    def update_gradients(self, dL_dF, X):
+        self.A.gradient = np.dot( X.T dL_dF)

-    def df_dtheta(self, dL_df, X):
-        df_dW = (dL_df[:, :, None]*X[:, None, :]).sum(0).T
-        df_dbias = (dL_df.sum(0))
-        return np.hstack((df_dW.flatten(), df_dbias))
-
-    def dL_dX(self, partial, X):
-        """The gradient of L with respect to the inputs to the mapping, where L is a function that is dependent on the output of the mapping, f."""
-        return (partial[:, None, :]*self.W[None, :, :]).sum(2)
+    def gradients_X(self, dL_dF, X):
+        return np.dot(dL_dF, self.A.T)
--- a/GPy/mappings/mlp.py
+++ b/GPy/mappings/mlp.py
@ -3,128 +3,40 @@

 import numpy as np
 from ..core.mapping import Mapping
+from ..core import Param

 class MLP(Mapping):
    """
-    Mapping based on a multi-layer perceptron neural network model.
-
-    .. math::
-
-       f(\\mathbf{x}*) = \\mathbf{W}^0\\boldsymbol{\\phi}(\\mathbf{W}^1\\mathbf{x}+\\mathbf{b}^1)^* + \\mathbf{b}^0
-
-    where
-
-    .. math::
-
-      \\phi(\\cdot) = \\text{tanh}(\\cdot)
-
-    :param X: input observations
-    :type X: ndarray
-    :param output_dim: dimension of output.
-    :type output_dim: int
-    :param hidden_dim: dimension of hidden layer. If it is an int, there is one hidden layer of the given dimension. If it is a list of ints there are as manny hidden layers as the length of the list, each with the given number of hidden nodes in it.
-    :type hidden_dim: int or list of ints. 
-
+    Mapping based on a multi-layer perceptron neural network model, with a single hidden layer
    """

-    def __init__(self, input_dim=1, output_dim=1, hidden_dim=3):
-        Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
-        self.name = 'mlp'
-        if isinstance(hidden_dim, int):
-            hidden_dim = [hidden_dim]
+    def __init__(self, input_dim=1, output_dim=1, hidden_dim=3, name='mlpmap'):
+        super(MLP).__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
        self.hidden_dim = hidden_dim
-        self.activation = [None]*len(self.hidden_dim)
-        self.W = []
-        self._dL_dW = []
-        self.bias = []
-        self._dL_dbias = []
-        self.W.append(np.zeros((self.input_dim, self.hidden_dim[0])))
-        self._dL_dW.append(np.zeros((self.input_dim, self.hidden_dim[0])))
-        self.bias.append(np.zeros(self.hidden_dim[0]))
-        self._dL_dbias.append(np.zeros(self.hidden_dim[0]))
-        self.num_params = self.hidden_dim[0]*(self.input_dim+1)
-        for h1, h0 in zip(hidden_dim[1:], hidden_dim[0:-1]):
-            self.W.append(np.zeros((h0, h1)))
-            self._dL_dW.append(np.zeros((h0, h1)))
-            self.bias.append(np.zeros(h1))
-            self._dL_dbias.append(np.zeros(h1))
-            self.num_params += h1*(h0+1)
-        self.W.append(np.zeros((self.hidden_dim[-1], self.output_dim)))
-        self._dL_dW.append(np.zeros((self.hidden_dim[-1], self.output_dim)))
-        self.bias.append(np.zeros(self.output_dim))
-        self._dL_dbias.append(np.zeros(self.output_dim))
-        self.num_params += self.output_dim*(self.hidden_dim[-1]+1)
-        self.randomize()
+        self.W1 = Param('W1', np.random.randn(self.input_dim, self.hidden_dim))
+        self.b1 = Param('b1', np.random.randn(self.hidden_dim))
+        self.W2 = Param('W2', np.random.randn(self.hidden_dim, self.output_dim))
+        self.b2 = Param('b2', np.random.randn(self.output_dim))

-    def _get_param_names(self):
-        return sum([['W%i_%i_%i' % (i, n, d)  for n in range(self.W[i].shape[0]) for d in range(self.W[i].shape[1])] + ['bias%i_%i' % (i, d) for d in range(self.W[i].shape[1])] for i in range(len(self.W))], [])
-
-    def _get_params(self):
-        param = np.array([])
-        for W, bias in zip(self.W, self.bias):
-            param = np.hstack((param, W.flatten(), bias))
-        return param
-    
-    def _set_params(self, x):
-        start = 0
-        for W, bias in zip(self.W, self.bias):
-            end = W.shape[0]*W.shape[1]+start
-            W[:] = x[start:end].reshape(W.shape[0], W.shape[1]).copy()
-            start = end
-            end = W.shape[1]+end
-            bias[:] = x[start:end].copy()
-            start = end
-
-    def randomize(self):
-        for W, bias in zip(self.W, self.bias):
-            W[:] = np.random.randn(W.shape[0], W.shape[1])/np.sqrt(W.shape[0]+1)
-            bias[:] = np.random.randn(W.shape[1])/np.sqrt(W.shape[0]+1)

    def f(self, X):
-        self._f_computations(X)
-        return np.dot(np.tanh(self.activation[-1]), self.W[-1]) + self.bias[-1]
+        N, D = X.shape
+        activations = np.tanh(np.dot(X,self.W1) + self.b1)
+        self.out = np.dot(self.activations,self.W2) + self.b2
+        return self.output_fn(self.out)

-    def _f_computations(self, X):
-        W = self.W[0]
-        bias = self.bias[0]
-        self.activation[0] = np.dot(X,W) + bias
-        for W, bias, index in zip(self.W[1:-1], self.bias[1:-1], range(1, len(self.activation))):
-            self.activation[index] = np.dot(np.tanh(self.activation[index-1]), W)+bias
+    def update_gradients(self, dL_dF, X):
+        activations = np.tanh(np.dot(X,self.W1) + self.b1)

-    def df_dtheta(self, dL_df, X):
-        self._df_computations(dL_df, X)
-        g = np.array([])
-        for gW, gbias in zip(self._dL_dW, self._dL_dbias):
-            g = np.hstack((g, gW.flatten(), gbias))
-        return g

-    def _df_computations(self, dL_df, X):
-        self._f_computations(X)
-        a0 = self.activation[-1]
-        W = self.W[-1]
-        self._dL_dW[-1] = (dL_df[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T
-        dL_dta=(dL_df[:, None, :]*W[None, :, :]).sum(2)
-        self._dL_dbias[-1] = (dL_df.sum(0))
-        for dL_dW, dL_dbias, W, bias, a0, a1 in zip(self._dL_dW[-2:0:-1],
-                                                    self._dL_dbias[-2:0:-1],
-                                                    self.W[-2:0:-1],
-                                                    self.bias[-2:0:-1],
-                                                    self.activation[-2::-1],
-                                                    self.activation[-1:0:-1]):
-            ta = np.tanh(a1)
-            dL_da = dL_dta*(1-ta*ta)
-            dL_dW[:] = (dL_da[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T
-            dL_dbias[:] = (dL_da.sum(0))
-            dL_dta = (dL_da[:, None, :]*W[None, :, :]).sum(2)
-        ta = np.tanh(self.activation[0])
-        dL_da = dL_dta*(1-ta*ta)
-        W = self.W[0]
-        self._dL_dW[0] = (dL_da[:, :, None]*X[:, None, :]).sum(0).T
-        self._dL_dbias[0] = (dL_da.sum(0))
-        self._dL_dX = (dL_da[:, None, :]*W[None, :, :]).sum(2)
+        #Evaluate second-layer gradients.
+        self.W2.gradient = np.dot(activations.T, dL_dF)
+        self.b2.gradient = np.sum(dL_dF, 0)
+
+        # Backpropagation to hidden layer.
+        delta_hid = np.dot(dL_dF, self.W2.T) * (1.0 - activations**2)
+
+        # Finally, evaluate the first-layer gradients.
+        self.W1.gradients = np.dot(X.T,delta_hid)
+        self.b1.gradients = np.sum(delta_hid, 0)

-        
-    def df_dX(self, dL_df, X):
-        self._df_computations(dL_df, X)
-        return self._dL_dX
-    
--- a/GPy/mappings/piecewise_linear.py
+++ b/GPy/mappings/piecewise_linear.py
@ -0,0 +1,94 @@
+from GPy.core.mapping import Mapping
+from GPy.core import Param
+import numpy as np
+
+class PiecewiseLinear(Mapping):
+    """
+    A piecewise-linear mapping.
+
+    The parameters of this mapping are the positions and values of the function where it is broken (self.breaks, self.values).
+
+    Outside the range of the breaks, the function is assumed to have gradient 1
+    """
+    def __init__(self, input_dim, output_dim, values, breaks, name='piecewise_linear'):
+
+        assert input_dim==1
+        assert output_dim==1
+
+        Mapping.__init__(self, input_dim, output_dim, name)
+
+        values, breaks = np.array(values).flatten(), np.array(breaks).flatten()
+        assert values.size == breaks.size
+        self.values = Param('values', values)
+        self.breaks = Param('breaks', breaks)
+        self.link_parameter(self.values)
+        self.link_parameter(self.breaks)
+
+    def parameters_changed(self):
+        self.order = np.argsort(self.breaks)*1
+        self.reverse_order = np.zeros_like(self.order)
+        self.reverse_order[self.order] = np.arange(self.order.size)
+
+        self.sorted_breaks = self.breaks[self.order]
+        self.sorted_values = self.values[self.order]
+
+        self.grads = np.diff(self.sorted_values)/np.diff(self.sorted_breaks)
+
+    def f(self, X):
+        x = X.flatten()
+        y = x.copy()
+
+        #first adjus the points below the first value
+        y[x<self.sorted_breaks[0]]  = x[x<self.sorted_breaks[0]] + self.sorted_values[0] - self.sorted_breaks[0]
+
+        #now all the points pas the last break
+        y[x>self.sorted_breaks[-1]]  = x[x>self.sorted_breaks[-1]] + self.sorted_values[-1] - self.sorted_breaks[-1]
+
+        #loop throught the pairs of points
+        for low, up, g, v in zip(self. sorted_breaks[:-1], self.sorted_breaks[1:], self.grads, self.sorted_values[:-1]):
+            i = np.logical_and(x>low, x<up)
+            y[i] = v + (x[i]-low)*g
+
+        return y.reshape(-1,1)
+
+    def update_gradients(self, dL_dF, X):
+        x = X.flatten()
+        dL_dF = dL_dF.flatten()
+
+        dL_db = np.zeros(self.sorted_breaks.size)
+        dL_dv = np.zeros(self.sorted_values.size)
+
+        #loop across each interval, computing the gradient for each of the 4 parameters that define it
+        for i, (low, up, g, v) in enumerate(zip(self. sorted_breaks[:-1], self.sorted_breaks[1:], self.grads, self.sorted_values[:-1])):
+            index = np.logical_and(x>low, x<up)
+            xx = x[index]
+            grad = dL_dF[index]
+            span = up-low
+            dL_dv[i] += np.sum(grad*( (low - xx)/span + 1))
+            dL_dv[i+1] += np.sum(grad*(xx-low)/span)
+            dL_db[i] += np.sum(grad*g*(xx-up)/span)
+            dL_db[i+1] += np.sum(grad*g*(low-xx)/span)
+
+        #now the end parts
+        dL_db[0] -= np.sum(dL_dF[x<self.sorted_breaks[0]])
+        dL_db[-1] -= np.sum(dL_dF[x>self.sorted_breaks[-1]])
+        dL_dv[0] += np.sum(dL_dF[x<self.sorted_breaks[0]])
+        dL_dv[-1] += np.sum(dL_dF[x>self.sorted_breaks[-1]])
+
+        #now put the gradients back in the correct order!
+        self.breaks.gradient = dL_db[self.reverse_order]
+        self.values.gradient = dL_dv[self.reverse_order]
+
+    def gradients_X(self, dL_dF, X):
+        x = X.flatten()
+
+        #outside the range of the breakpoints, the function is just offset by a contant, so the partial derivative is 1.
+        dL_dX = dL_dF.copy().flatten()
+
+        #insude the breakpoints, the partial derivative is self.grads
+        for low, up, g, v in zip(self. sorted_breaks[:-1], self.sorted_breaks[1:], self.grads, self.sorted_values[:-1]):
+            i = np.logical_and(x>low, x<up)
+            dL_dX[i] = dL_dF[i]*g
+
+        return dL_dX.reshape(-1,1)
+