From 128ebbabc5f99365f3ac7d9490c2a271854f064f Mon Sep 17 00:00:00 2001
From: Neil Lawrence
Date: Thu, 29 Aug 2013 19:26:00 +0200
Subject: [PATCH] Added mlp mapping (with possibility of having multiple layers).
---
 GPy/core/gp.py                                |   1 -
 GPy/core/mapping.py                           |   5 +-
 GPy/examples/dimensionality_reduction.py      |   2 +-
 GPy/kern/constructors.py                      |  38 +++++-
 GPy/kern/parts/__init__.py                    |   1 +
 GPy/mappings/__init__.py                      |   8 +-
 GPy/mappings/{kernel_mapping.py => kernel.py} |   6 +-
 GPy/mappings/{linear_mapping.py => linear.py} |   5 +-
 GPy/mappings/mlp.py                           | 126 ++++++++++++++++++
 GPy/models/gplvm.py                           |   2 +
 GPy/notes.txt                                 |   4 +
 GPy/testing/mapping_tests.py                  |   6 +
 12 files changed, 187 insertions(+), 17 deletions(-)
 rename GPy/mappings/{kernel_mapping.py => kernel.py} (94%)
 rename GPy/mappings/{linear_mapping.py => linear.py} (94%)
 create mode 100644 GPy/mappings/mlp.py

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index faa77e80..278ddc74 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -6,7 +6,6 @@ import numpy as np
 import pylab as pb
 from .. import kern
 from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs
-# from ..util.plot import gpplot, Tango
 from ..likelihoods import EP
 from gp_base import GPBase

diff --git a/GPy/core/mapping.py b/GPy/core/mapping.py
index 074a30fc..02b9664a 100644
--- a/GPy/core/mapping.py
+++ b/GPy/core/mapping.py
@@ -1,11 +1,10 @@
 # Copyright (c) 2013, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-import numpy as np
-from .. import kern
 from ..util.plot import Tango, x_frame1D, x_frame2D
+from parameterized import Parameterized
+import numpy as np
 import pylab as pb
-from GPy.core.parameterized import Parameterized

 class Mapping(Parameterized):
     """
diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index 09cc9abe..994e3b48 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -397,7 +397,7 @@ def bcgplvm_linear_stick(kernel=None):
 def bcgplvm_stick(kernel=None):
     data = GPy.util.datasets.osu_run1()
     # optimize
-    back_kernel=GPy.kern.rbf(data['Y'].shape[1], lengthscale=10.)
+    back_kernel=GPy.kern.rbf(data['Y'].shape[1], lengthscale=5.)
     mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel)
     m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
     m.optimize(messages=1, max_f_eval=10000)
diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index 03cf43c1..025867a9 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -69,6 +69,40 @@ def mlp(input_dim,variance=1., weight_variance=None,bias_variance=100.,ARD=False
     part = parts.mlp.MLP(input_dim,variance,weight_variance,bias_variance,ARD)
     return kern(input_dim, [part])

+# def gibbs(input_dim,variance=1., mapping=None):
+#     """
+#     Gibbs and MacKay non-stationary covariance function.
+
+#     .. math::
+
+#        r = \sqrt{(x_i - x_j)^\top (x_i - x_j)}
+
+#        k(x_i, x_j) = \sigma^2 Z \exp\left(-\frac{r^2}{l(x_i)^2 + l(x_j)^2}\right)
+
+#        Z = \sqrt{\frac{2 l(x_i) l(x_j)}{l(x_i)^2 + l(x_j)^2}}
+
+#     where :math:`l(x)` gives the lengthscale as a function of input location.
+#     This is the non-stationary kernel proposed by Mark Gibbs in his 1997
+#     thesis. It is similar to an RBF, but has a lengthscale that varies with
+#     input location; this is what leads to the additional term :math:`Z` in
+#     front of the exponential.
+
+#     The parameters are :math:`\sigma^2`, the process variance, and the
+#     parameters of :math:`l(x)`, which is a function that can be specified
+#     by the user; by default a multi-layer perceptron is used.
+
+#     :param input_dim: the number of input dimensions
+#     :type input_dim: int
+#     :param variance: the variance :math:`\sigma^2`
+#     :type variance: float
+#     :param mapping: the mapping that gives the lengthscale across the input space.
+#     :type mapping: GPy.core.Mapping
+#     :rtype: Kernpart object
+
+#     """
+#     part = parts.gibbs.Gibbs(input_dim,variance,mapping)
+#     return kern(input_dim, [part])
+
 def poly(input_dim,variance=1., weight_variance=None,bias_variance=1.,degree=2, ARD=False):
     """
     Construct a polynomial kernel
@@ -312,10 +346,10 @@ def coregionalise(output_dim, rank=1, W=None, kappa=None):
     Used for computing covariance functions of the form
     .. math::
-       k_2(x, y)=B k(x, y)
+       k_2(x, y)=\mathbf{B} k(x, y)

     where
     .. math::
-       B = WW^\top + kappa I..
+       \mathbf{B} = \mathbf{W}\mathbf{W}^\top + \kappa \mathbf{I}

     :param output_dim: the number of output dimensions
     :type output_dim: int
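
Review aid, not part of the patch: since the gibbs constructor above is left commented out, the following is a small numpy illustration of the covariance its docstring describes, restricted to 1-d inputs. The gibbs_cov helper and the lengthscale function l are purely illustrative stand-ins for the eventual Kernpart and mapping.

    import numpy as np

    def gibbs_cov(X, l, variance=1.0):
        # Gibbs non-stationary covariance for 1-d inputs X of shape (n, 1);
        # l returns the lengthscale at each input location.
        lx = l(X).flatten()
        l2 = lx[:, None]**2 + lx[None, :]**2   # l(x_i)^2 + l(x_j)^2
        r2 = (X - X.T)**2                      # squared distances
        Z = np.sqrt(2.*np.outer(lx, lx)/l2)    # the extra normalising term
        return variance*Z*np.exp(-r2/l2)

    X = np.linspace(-1., 1., 5)[:, None]
    K = gibbs_cov(X, lambda X: 0.5 + X**2)     # lengthscale grows away from 0
    print np.allclose(K, K.T)                      # symmetric
    print np.all(np.linalg.eigvalsh(K) > -1e-10)   # positive semi-definite
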
diff --git a/GPy/kern/parts/__init__.py b/GPy/kern/parts/__init__.py
index cf8df575..053e280f 100644
--- a/GPy/kern/parts/__init__.py
+++ b/GPy/kern/parts/__init__.py
@@ -4,6 +4,7 @@ import coregionalise
 import exponential
 import finite_dimensional
 import fixed
+import gibbs
 import independent_outputs
 import linear
 import Matern32
diff --git a/GPy/mappings/__init__.py b/GPy/mappings/__init__.py
index d1383c18..b0d90ba2 100644
--- a/GPy/mappings/__init__.py
+++ b/GPy/mappings/__init__.py
@@ -1,7 +1,7 @@
 # Copyright (c) 2013, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-from kernel_mapping import Kernel
-from linear_mapping import Linear
-#from mlp_mapping import MLP
-#from rbf_mapping import RBF
+from kernel import Kernel
+from linear import Linear
+from mlp import MLP
+#from rbf import RBF
diff --git a/GPy/mappings/kernel_mapping.py b/GPy/mappings/kernel.py
similarity index 94%
rename from GPy/mappings/kernel_mapping.py
rename to GPy/mappings/kernel.py
index 6ea4322f..a794e561 100644
--- a/GPy/mappings/kernel_mapping.py
+++ b/GPy/mappings/kernel.py
@@ -2,8 +2,8 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-from ..core import Mapping
-from .. import kern
+from ..core.mapping import Mapping
+from ..kern.kern import kern

 class Kernel(Mapping):
     """
@@ -42,7 +42,7 @@ class Kernel(Mapping):

     def _set_params(self, x):
         self.A = x[:self.num_data * self.output_dim].reshape(self.num_data, self.output_dim).copy()
-        self.bias = x[self.num_data*self.output_dim:]
+        self.bias = x[self.num_data*self.output_dim:].copy()

     def randomize(self):
         self.A = np.random.randn(self.num_data, self.output_dim)/np.sqrt(self.num_data+1)
diff --git a/GPy/mappings/linear_mapping.py b/GPy/mappings/linear.py
similarity index 94%
rename from GPy/mappings/linear_mapping.py
rename to GPy/mappings/linear.py
index 969cdd82..5f6aaa49 100644
--- a/GPy/mappings/linear_mapping.py
+++ b/GPy/mappings/linear.py
@@ -2,8 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-from ..core import Mapping
-from .. import kern
+from ..core.mapping import Mapping

 class Linear(Mapping):
     """
@@ -35,7 +34,7 @@ class Linear(Mapping):

     def _set_params(self, x):
         self.W = x[:self.input_dim * self.output_dim].reshape(self.input_dim, self.output_dim).copy()
-        self.bias = x[self.input_dim*self.output_dim:]
+        self.bias = x[self.input_dim*self.output_dim:].copy()

     def randomize(self):
         self.W = np.random.randn(self.input_dim, self.output_dim)/np.sqrt(self.input_dim + 1)
         self.bias = np.random.randn(self.output_dim)/np.sqrt(self.input_dim + 1)
diff --git a/GPy/mappings/mlp.py b/GPy/mappings/mlp.py
new file mode 100644
index 00000000..ce0e04b0
--- /dev/null
+++ b/GPy/mappings/mlp.py
@@ -0,0 +1,126 @@
+# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from ..core.mapping import Mapping
+
+class MLP(Mapping):
+    """
+    Mapping based on a multi-layer perceptron neural network model.
+
+    .. math::
+
+       f(\mathbf{x}_*) = \mathbf{W}^0 \boldsymbol{\phi}(\mathbf{W}^1 \mathbf{x}_* + \mathbf{b}^1) + \mathbf{b}^0
+
+    where
+
+    .. math::
+
+       \boldsymbol{\phi}(\cdot) = \tanh(\cdot)
+
+    :param input_dim: dimension of the input.
+    :type input_dim: int
+    :param output_dim: dimension of the output.
+    :type output_dim: int
+    :param hidden_dim: dimension of the hidden layer. If it is an int, there is one hidden layer of the given dimension. If it is a list of ints, there are as many hidden layers as the length of the list, each with the given number of hidden nodes in it.
+    :type hidden_dim: int or list of ints.
+    """
+
+    def __init__(self, input_dim=1, output_dim=1, hidden_dim=3):
+        Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
+        if isinstance(hidden_dim, int):
+            hidden_dim = [hidden_dim]
+        self.hidden_dim = hidden_dim
+        self.activation = [None]*len(self.hidden_dim)
+        # Weights, biases and their gradient storage, layer by layer:
+        # input -> first hidden, hidden -> hidden, last hidden -> output.
+        self.W = []
+        self._dL_dW = []
+        self.bias = []
+        self._dL_dbias = []
+        self.W.append(np.zeros((self.input_dim, self.hidden_dim[0])))
+        self._dL_dW.append(np.zeros((self.input_dim, self.hidden_dim[0])))
+        self.bias.append(np.zeros(self.hidden_dim[0]))
+        self._dL_dbias.append(np.zeros(self.hidden_dim[0]))
+        self.num_params = self.hidden_dim[0]*(self.input_dim+1)
+        for h1, h0 in zip(hidden_dim[1:], hidden_dim[0:-1]):
+            self.W.append(np.zeros((h0, h1)))
+            self._dL_dW.append(np.zeros((h0, h1)))
+            self.bias.append(np.zeros(h1))
+            self._dL_dbias.append(np.zeros(h1))
+            self.num_params += h1*(h0+1)
+        self.W.append(np.zeros((self.hidden_dim[-1], self.output_dim)))
+        self._dL_dW.append(np.zeros((self.hidden_dim[-1], self.output_dim)))
+        self.bias.append(np.zeros(self.output_dim))
+        self._dL_dbias.append(np.zeros(self.output_dim))
+        self.num_params += self.output_dim*(self.hidden_dim[-1]+1)
+        self.randomize()
+
+    def _get_param_names(self):
+        return sum([['W%i_%i_%i' % (i, n, d) for n in range(self.W[i].shape[0]) for d in range(self.W[i].shape[1])]
+                    + ['bias%i_%i' % (i, d) for d in range(self.W[i].shape[1])]
+                    for i in range(len(self.W))], [])
+
+    def _get_params(self):
+        param = np.array([])
+        for W, bias in zip(self.W, self.bias):
+            param = np.hstack((param, W.flatten(), bias))
+        return param
+
+    def _set_params(self, x):
+        start = 0
+        for W, bias in zip(self.W, self.bias):
+            end = W.shape[0]*W.shape[1] + start
+            W[:] = x[start:end].reshape(W.shape[0], W.shape[1]).copy()
+            start = end
+            end = W.shape[1] + end
+            bias[:] = x[start:end].copy()
+            start = end
+
+    def randomize(self):
+        for W, bias in zip(self.W, self.bias):
+            W[:] = np.random.randn(W.shape[0], W.shape[1])/np.sqrt(W.shape[0]+1)
+            bias[:] = np.random.randn(W.shape[1])/np.sqrt(W.shape[0]+1)
+
+    def f(self, X):
+        self._f_computations(X)
+        return np.dot(np.tanh(self.activation[-1]), self.W[-1]) + self.bias[-1]
+
+    def _f_computations(self, X):
+        # Forward pass, storing the pre-activation of each hidden layer.
+        W = self.W[0]
+        bias = self.bias[0]
+        self.activation[0] = np.dot(X, W) + bias
+        for W, bias, index in zip(self.W[1:-1], self.bias[1:-1], range(1, len(self.activation))):
+            self.activation[index] = np.dot(np.tanh(self.activation[index-1]), W) + bias
+
+    def df_dtheta(self, dL_df, X):
+        self._df_computations(dL_df, X)
+        g = np.array([])
+        for gW, gbias in zip(self._dL_dW, self._dL_dbias):
+            g = np.hstack((g, gW.flatten(), gbias))
+        return g
+
+    def _df_computations(self, dL_df, X):
+        # Backward pass (back-propagation): output layer first, then walk
+        # back through the hidden layers to the inputs.
+        self._f_computations(X)
+        a0 = self.activation[-1]
+        W = self.W[-1]
+        self._dL_dW[-1] = (dL_df[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T
+        dL_dta = (dL_df[:, None, :]*W[None, :, :]).sum(2)
+        self._dL_dbias[-1] = dL_df.sum(0)
+        for dL_dW, dL_dbias, W, bias, a0, a1 in zip(self._dL_dW[-2:0:-1],
+                                                    self._dL_dbias[-2:0:-1],
+                                                    self.W[-2:0:-1],
+                                                    self.bias[-2:0:-1],
+                                                    self.activation[-2::-1],
+                                                    self.activation[-1:0:-1]):
+            ta = np.tanh(a1)
+            dL_da = dL_dta*(1-ta*ta)
+            dL_dW[:] = (dL_da[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T
+            dL_dbias[:] = dL_da.sum(0)
+            dL_dta = (dL_da[:, None, :]*W[None, :, :]).sum(2)
+        ta = np.tanh(self.activation[0])
+        dL_da = dL_dta*(1-ta*ta)
+        W = self.W[0]
+        self._dL_dW[0] = (dL_da[:, :, None]*X[:, None, :]).sum(0).T
+        self._dL_dbias[0] = dL_da.sum(0)
+        self._dL_dX = (dL_da[:, None, :]*W[None, :, :]).sum(2)
+
+    def df_dX(self, dL_df, X):
+        self._df_computations(dL_df, X)
+        return self._dL_dX
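
For review, a minimal usage sketch of the new mapping (not part of the patch; it assumes only the MLP class added above):

    import numpy as np
    import GPy

    # one hidden layer of three tanh units
    mapping = GPy.mappings.MLP(input_dim=2, output_dim=2, hidden_dim=3)
    X = np.random.randn(10, 2)
    print mapping.f(X).shape  # (10, 2)

    # several hidden layers: pass a list of widths instead of an int
    deep = GPy.mappings.MLP(input_dim=2, output_dim=2, hidden_dim=[3, 4])
    print deep.num_params  # 3*(2+1) + 4*(3+1) + 2*(4+1) = 35
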
diff --git a/GPy/models/gplvm.py b/GPy/models/gplvm.py
index 292b3da4..c2a7768c 100644
--- a/GPy/models/gplvm.py
+++ b/GPy/models/gplvm.py
@@ -8,6 +8,7 @@ import sys, pdb
 from .. import kern
 from ..core import Model
 from ..util.linalg import pdinv, PCA
+from ..core.priors import Gaussian as Gaussian_prior
 from ..core import GP
 from ..likelihoods import Gaussian
 from .. import util
@@ -33,6 +34,7 @@ class GPLVM(GP):
         kernel = kern.rbf(input_dim, ARD=input_dim > 1) + kern.bias(input_dim, np.exp(-2))
         likelihood = Gaussian(Y, normalize=normalize_Y, variance=np.exp(-2.))
         GP.__init__(self, X, likelihood, kernel, normalize_X=False)
+        self.set_prior('.*X', Gaussian_prior(0, 1))
         self.ensure_default_constraints()

     def initialise_latent(self, init, input_dim, Y):
diff --git a/GPy/notes.txt b/GPy/notes.txt
index 530b7036..c8c184a1 100644
--- a/GPy/notes.txt
+++ b/GPy/notes.txt
@@ -62,3 +62,7 @@ For some reason a stub of _get_param_names(self) wasn't available in the Paramet
 Is there a quick FAQ or something on how to build the documentation? I did it once, but can't remember!

 Similar for the nosetests ... even ran them last week but can't remember the command!
+
+Now added Gaussian priors to the GPLVM latent variables by default. When running the GPy.examples.dimensionality_reduction.stick() example, the output of print model shows the same value for the prior+likelihood as for the prior alone.
+
+For the back-constrained GP-LVM the priors need to be on the Xs, not on the model parameters (because the Xs aren't parameters there, they are constrained). Need to work out how to do this.
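
The mapping tests below use the Mapping_check_* gradient checkers; the same back-propagation in mlp.py can also be checked by hand with central finite differences (a sketch, not part of the patch, using only methods defined above):

    import numpy as np
    import GPy

    np.random.seed(0)
    mapping = GPy.mappings.MLP(input_dim=2, output_dim=2, hidden_dim=[3, 4])
    X = np.random.randn(10, 2)
    dL_df = np.random.randn(10, 2)  # an arbitrary upstream gradient

    # analytic gradient of L = (dL_df*f(X)).sum() w.r.t. the parameters
    g = mapping.df_dtheta(dL_df, X)

    # central finite differences over every parameter
    theta = mapping._get_params().copy()
    g_fd = np.zeros_like(theta)
    eps = 1e-6
    for i in range(theta.size):
        t = theta.copy()
        t[i] += eps
        mapping._set_params(t)
        f_plus = (dL_df*mapping.f(X)).sum()
        t[i] -= 2*eps
        mapping._set_params(t)
        f_minus = (dL_df*mapping.f(X)).sum()
        g_fd[i] = (f_plus - f_minus)/(2.*eps)
    mapping._set_params(theta)
    print np.abs(g - g_fd).max()  # should be of order 1e-9
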
diff --git a/GPy/testing/mapping_tests.py b/GPy/testing/mapping_tests.py index 578c3dac..cd28e71a 100644 --- a/GPy/testing/mapping_tests.py +++ b/GPy/testing/mapping_tests.py @@ -21,6 +21,12 @@ class MappingTests(unittest.TestCase): self.assertTrue(GPy.core.Mapping_check_df_dtheta(mapping=mapping).checkgrad(verbose=verbose)) self.assertTrue(GPy.core.Mapping_check_df_dX(mapping=mapping).checkgrad(verbose=verbose)) + def test_mlpmapping(self): + verbose = False + mapping = GPy.mappings.MLP(input_dim=2, hidden_dim=[3, 4, 8, 2], output_dim=2) + self.assertTrue(GPy.core.Mapping_check_df_dtheta(mapping=mapping).checkgrad(verbose=verbose)) + self.assertTrue(GPy.core.Mapping_check_df_dX(mapping=mapping).checkgrad(verbose=verbose)) + if __name__ == "__main__":
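
With the patch applied, the MLP mapping should slot into the back-constrained GP-LVM in the same way as the kernel mapping in the bcgplvm_stick example above. A hypothetical sketch, assuming GPy.models.BCGPLVM accepts any GPy.core.Mapping and a default kernel:

    import GPy

    data = GPy.util.datasets.osu_run1()
    mapping = GPy.mappings.MLP(input_dim=data['Y'].shape[1], output_dim=2,
                               hidden_dim=10)
    m = GPy.models.BCGPLVM(data['Y'], 2, mapping=mapping)
    m.optimize(messages=1, max_f_eval=10000)
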