From 1766db89feb9ef2bf049784d5357190325c5d663 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Mon, 24 Feb 2014 13:51:03 +0000 Subject: [PATCH] adding and producting in stationary is no stationary --- GPy/kern/__init__.py | 4 +- GPy/kern/_src/independent_outputs.py | 104 +++++++++++++++----------- GPy/kern/_src/kern.py | 2 +- GPy/kern/_src/rbf.py | 2 +- GPy/kern/_src/static.py | 27 +++---- GPy/kern/_src/stationary.py | 106 +++++++++++++++++++-------- 6 files changed, 154 insertions(+), 91 deletions(-) diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index f91f5ac6..a1f57619 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -1,11 +1,12 @@ -from _src.rbf import RBF from _src.kern import Kern +from _src.rbf import RBF from _src.linear import Linear from _src.static import Bias, White from _src.brownian import Brownian from _src.stationary import Exponential, Matern32, Matern52, ExpQuad, RatQuad, Cosine from _src.mlp import MLP from _src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52 +from _src.independent_outputs import IndependentOutputs #import coregionalize #import eq_ode1 #import finite_dimensional @@ -13,7 +14,6 @@ from _src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern5 #import gibbs #import hetero #import hierarchical -#import independent_outputs #import ODE_1 #import periodic_exponential #import periodic_Matern32 diff --git a/GPy/kern/_src/independent_outputs.py b/GPy/kern/_src/independent_outputs.py index 98f1203d..6d3943ae 100644 --- a/GPy/kern/_src/independent_outputs.py +++ b/GPy/kern/_src/independent_outputs.py @@ -2,7 +2,7 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) -from kernpart import Kernpart +from kern import Kern import numpy as np def index_to_slices(index): @@ -31,67 +31,89 @@ def index_to_slices(index): [ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))] return ret -class 
IndependentOutputs(Kernpart): +class IndependentOutputs(Kern): """ - A kernel part shich can reopresent several independent functions. + A kernel which can reopresent several independent functions. this kernel 'switches off' parts of the matrix where the output indexes are different. The index of the functions is given by the last column in the input X - the rest of the columns of X are passed to the kernel for computation (in blocks). + the rest of the columns of X are passed to the underlying kernel for computation (in blocks). """ - def __init__(self,k): - self.input_dim = k.input_dim + 1 - self.num_params = k.num_params - self.name = 'iops('+ k.name + ')' - self.k = k + def __init__(self, kern, name='independ'): + super(IndependentOutputs, self).__init__(kern.input_dim+1, name) + self.kern = kern + self.add_parameters(self.kern) - def _get_params(self): - return self.k._get_params() + def K(self,X ,X2=None): + X, slices = X[:,:-1], index_to_slices(X[:,-1]) + if X2 is None: + target = np.zeros((X.shape[0], X.shape[0])) + [[np.copyto(target[s,s], self.kern.K(X[s], None)) for s in slices_i] for slices_i in slices] + else: + X2, slices2 = X2[:,:-1],index_to_slices(X2[:,-1]) + target = np.zeros((X.shape[0], X2.shape[0])) + [[[np.copyto(target[s, s2], self.kern.K(X[s],X2[s2])) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)] + return target - def _set_params(self,x): - self.k._set_params(x) - self.params = x + def Kdiag(self,X): + X, slices = X[:,:-1], index_to_slices(X[:,-1]) + target = np.zeros(X.shape[0]) + [[np.copyto(target[s], self.kern.Kdiag(X[s])) for s in slices_i] for slices_i in slices] + return target - def _get_param_names(self): - return self.k._get_param_names() + def update_gradients_full(self,dL_dK,X,X2=None): + target = np.zeros(self.kern.size) + def collate_grads(dL, X, X2): + self.kern.update_gradients_full(dL,X,X2) + self.kern._collect_gradient(target) - def K(self,X,X2,target): - #Sort out the slices from the 
input data X,slices = X[:,:-1],index_to_slices(X[:,-1]) if X2 is None: - X2,slices2 = X,slices + [[collate_grads(dL_dK[s,s], X[s], None) for s in slices_i] for slices_i in slices] else: - X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1]) + X2, slices2 = X2[:,:-1], index_to_slices(X2[:,-1]) + [[[collate_grads(dL_dK[s,s2],X[s],X2[s2]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)] - [[[self.k.K(X[s],X2[s2],target[s,s2]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)] + self.kern._set_gradient(target) - def Kdiag(self,X,target): - X,slices = X[:,:-1],index_to_slices(X[:,-1]) - [[self.k.Kdiag(X[s],target[s]) for s in slices_i] for slices_i in slices] - - def _param_grad_helper(self,dL_dK,X,X2,target): - X,slices = X[:,:-1],index_to_slices(X[:,-1]) + def gradients_X(self,dL_dK, X, X2=None): + target = np.zeros_like(X) + X, slices = X[:,:-1],index_to_slices(X[:,-1]) if X2 is None: - X2,slices2 = X,slices + [[np.copyto(target[s,:-1], self.kern.gradients_X(dL_dK[s,s],X[s],None)) for s in slices_i] for slices_i in slices] else: X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1]) - [[[self.k._param_grad_helper(dL_dK[s,s2],X[s],X2[s2],target) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)] + [[[np.copyto(target[s,:-1], self.kern.gradients_X(dL_dK[s,s2], X[s], X2[s2])) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)] + return target + def gradients_X_diag(self, dL_dKdiag, X): + X, slices = X[:,:-1], index_to_slices(X[:,-1]) + target = np.zeros(X.shape) + [[np.copyto(target[s,:-1], self.kern.gradients_X_diag(dL_dKdiag[s],X[s])) for s in slices_i] for slices_i in slices] + return target - def gradients_X(self,dL_dK,X,X2,target): + def update_gradients_diag(self,dL_dKdiag,X,target): + target = np.zeros(self.kern.size) + def collate_grads(dL, X): + self.kern.update_gradients_diag(dL,X) + self.kern._collect_gradient(target) X,slices = 
X[:,:-1],index_to_slices(X[:,-1]) - if X2 is None: - X2,slices2 = X,slices - else: - X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1]) - [[[self.k.gradients_X(dL_dK[s,s2],X[s],X2[s2],target[s,:-1]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)] + [[collate_grads(dL_dKdiag[s], X[s,:]) for s in slices_i] for slices_i in slices] + self.kern._set_gradient(target) - def dKdiag_dX(self,dL_dKdiag,X,target): - X,slices = X[:,:-1],index_to_slices(X[:,-1]) - [[self.k.dKdiag_dX(dL_dKdiag[s],X[s],target[s,:-1]) for s in slices_i] for slices_i in slices] +def Hierarchical(kern_f, kern_g, name='hierarchy'): + """ + A kernel which can represent a simple hierarchical model. + See Hensman et al 2013, "Hierarchical Bayesian modelling of gene expression time + series across irregularly sampled replicates and clusters" + http://www.biomedcentral.com/1471-2105/14/252 + + The index of the functions is given by the last column in the input X + the rest of the columns of X are passed to the underlying kernel for computation (in blocks). + + """ + assert kern_f.input_dim == kern_g.input_dim + return kern_f + IndependentOutputs(kern_g) - def dKdiag_dtheta(self,dL_dKdiag,X,target): - X,slices = X[:,:-1],index_to_slices(X[:,-1]) - [[self.k.dKdiag_dX(dL_dKdiag[s],X[s],target) for s in slices_i] for slices_i in slices] diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index 3ef231b3..92b1b489 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -101,7 +101,7 @@ class Kern(Parameterized): """ Here we overload the '*' operator. See self.prod for more information""" return self.prod(other) - def __pow__(self, other, tensor=False): + def __pow__(self, other): """ Shortcut for tensor `prod`. 
""" diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 0c8588a2..fad905a5 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -128,7 +128,7 @@ class RBF(Kern): def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, posterior_variational): mu = posterior_variational.mean - S = posterior_variational.variance + S = posterior_variational.variance self._psi_computations(Z, mu, S) #contributions from psi0: diff --git a/GPy/kern/_src/static.py b/GPy/kern/_src/static.py index 28854162..09ab0ded 100644 --- a/GPy/kern/_src/static.py +++ b/GPy/kern/_src/static.py @@ -8,6 +8,16 @@ from ...core.parameterization.transformations import Logexp import numpy as np class Static(Kern): + def __init__(self, input_dim, variance, name): + super(Static, self).__init__(input_dim, name) + self.variance = Param('variance', variance, Logexp()) + self.add_parameters(self.variance) + + def Kdiag(self, X): + ret = np.empty((X.shape[0],), dtype=np.float64) + ret[:] = self.variance + return ret + def gradients_X(self, dL_dK, X, X2, target): return np.zeros(X.shape) @@ -34,9 +44,6 @@ class Static(Kern): class White(Static): def __init__(self, input_dim, variance=1., name='white'): super(White, self).__init__(input_dim, name) - self.input_dim = input_dim - self.variance = Param('variance', variance, Logexp()) - self.add_parameters(self.variance) def K(self, X, X2=None): if X2 is None: @@ -44,11 +51,6 @@ class White(Static): else: return np.zeros((X.shape[0], X2.shape[0])) - def Kdiag(self, X): - ret = np.ones(X.shape[0]) - ret[:] = self.variance - return ret - def psi2(self, Z, mu, S, target): return np.zeros((mu.shape[0], Z.shape[0], Z.shape[0]), dtype=np.float64) @@ -63,10 +65,8 @@ class White(Static): class Bias(Static): - def __init__(self, input_dim, variance=1., name=None): + def __init__(self, input_dim, variance=1., name='bias'): super(Bias, self).__init__(input_dim, name) - self.variance = Param("variance", variance, Logexp()) - 
self.add_parameter(self.variance) def K(self, X, X2=None): shape = (X.shape[0], X.shape[0] if X2 is None else X2.shape[0]) @@ -74,11 +74,6 @@ class Bias(Static): ret[:] = self.variance return ret - def Kdiag(self, X): - ret = np.empty((X.shape[0],), dtype=np.float64) - ret[:] = self.variance - return ret - def update_gradients_full(self, dL_dK, X, X2=None): self.variance.gradient = dL_dK.sum() diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py index 86db393a..dde26e63 100644 --- a/GPy/kern/_src/stationary.py +++ b/GPy/kern/_src/stationary.py @@ -32,6 +32,13 @@ class Stationary(Kern): assert self.variance.size==1 self.add_parameters(self.variance, self.lengthscale) + def K_of_r(self, r): + raise NotImplementedError, "implement the covariance function as a fn of r to use this class" + + def K(self, X, X2=None): + r = self._scaled_dist(X, X2) + return self.K_of_r(r) + def _dist(self, X, X2): if X2 is None: X2 = X @@ -50,11 +57,11 @@ self.lengthscale.gradient = 0. 
def update_gradients_full(self, dL_dK, X, X2=None): - K = self.K(X, X2) - self.variance.gradient = np.sum(K * dL_dK)/self.variance + r = self._scaled_dist(X, X2) + K = self.K_of_r(r) rinv = self._inv_dist(X, X2) - dL_dr = self.dK_dr(X, X2) * dL_dK + dL_dr = self.dK_dr(r) * dL_dK x_xl3 = np.square(self._dist(X, X2)) / self.lengthscale**3 if self.ARD: @@ -62,6 +69,8 @@ else: self.lengthscale.gradient = -((dL_dr*rinv)[:,:,None]*x_xl3).sum() + self.variance.gradient = np.sum(K * dL_dK)/self.variance + def _inv_dist(self, X, X2=None): dist = self._scaled_dist(X, X2) if X2 is None: @@ -72,7 +81,8 @@ return 1./np.where(dist != 0., dist, np.inf) def gradients_X(self, dL_dK, X, X2=None): - dL_dr = self.dK_dr(X, X2) * dL_dK + r = self._scaled_dist(X, X2) + dL_dr = self.dK_dr(r) * dL_dK invdist = self._inv_dist(X, X2) ret = np.sum((invdist*dL_dr)[:,:,None]*self._dist(X, X2),1)/self.lengthscale**2 if X2 is None: @@ -82,19 +92,65 @@ return ret def gradients_X_diag(self, dL_dKdiag, X): return np.zeros(X.shape) + def add(self, other, tensor=False): + if not tensor: + return StatAdd(self, other) + else: + return super(Stationary, self).add(other, tensor) + + def prod(self, other, tensor=False): + if not tensor: + return StatProd(self, other) + else: + return super(Stationary, self).prod(other, tensor) +class StatAdd(Stationary): + """ + Addition of two Stationary kernels on the same space is still stationary. + + If you need to add two (stationary) kernels on separate spaces, use the generic add class. + """ + def __init__(self, k1, k2): + assert isinstance(k1, Stationary) + assert isinstance(k2, Stationary) + self.k1, self.k2 = k1, k2 + self.add_parameters(k1, k2) + + def K_of_r(self, r): + return self.k1.K_of_r(r) + self.k2.K_of_r(r) + + def dK_dr(self, r): + return self.k1.dK_dr(r) + self.k2.dK_dr(r) + +class StatProd(Stationary): + """ + Product of two Stationary kernels on the same space is still stationary. 
+ + If you need to multiply two (stationary) kernels on separate spaces, use the generic Prod class. + """ + def __init__(self, k1, k2): + assert isinstance(k1, Stationary) + assert isinstance(k2, Stationary) + self.k1, self.k2 = k1, k2 + self.add_parameters(k1, k2) + + def K_of_r(self, r): + return self.k1.K_of_r(r) * self.k2.K_of_r(r) + + def dK_dr(self, r): + return self.k1.dK_dr(r) * self.k2.K_of_r(r) + self.k2.dK_dr(r) * self.k1.K_of_r(r) + class Exponential(Stationary): def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, name='Exponential'): super(Exponential, self).__init__(input_dim, variance, lengthscale, ARD, name) def K(self, X, X2=None): - dist = self._scaled_dist(X, X2) return self.variance * np.exp(-0.5 * dist) - def dK_dr(self, X, X2): - return -0.5*self.K(X, X2) + def dK_dr(self, r): + return -0.5*self.K_of_r(r) class Matern32(Stationary): """ @@ -109,13 +165,11 @@ def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, name='Mat32'): super(Matern32, self).__init__(input_dim, variance, lengthscale, ARD, name) - def K(self, X, X2=None): - dist = self._scaled_dist(X, X2) - return self.variance * (1. + np.sqrt(3.) * dist) * np.exp(-np.sqrt(3.) * dist) + def K_of_r(self, r): + return self.variance * (1. + np.sqrt(3.) * r) * np.exp(-np.sqrt(3.) 
* r) - def dK_dr(self, X, X2): - dist = self._scaled_dist(X, X2) - return -3.*self.variance*dist*np.exp(-np.sqrt(3.)*dist) + def dK_dr(self,r): + return -3.*self.variance*r*np.exp(-np.sqrt(3.)*r) def Gram_matrix(self, F, F1, F2, lower, upper): """ @@ -153,12 +207,10 @@ class Matern52(Stationary): k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac53 r^2) \exp(- \sqrt{5} r) \ \ \ \ \ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} } """ - def K(self, X, X2=None): - r = self._scaled_dist(X, X2) + def K_of_r(self, r): return self.variance*(1+np.sqrt(5.)*r+5./3*r**2)*np.exp(-np.sqrt(5.)*r) - def dK_dr(self, X, X2): - r = self._scaled_dist(X, X2) + def dK_dr(self, r): return self.variance*(10./3*r -5.*r -5.*np.sqrt(5.)/3*r**2)*np.exp(-np.sqrt(5.)*r) def Gram_matrix(self,F,F1,F2,F3,lower,upper): @@ -197,24 +249,20 @@ class ExpQuad(Stationary): def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, name='ExpQuad'): super(ExpQuad, self).__init__(input_dim, variance, lengthscale, ARD, name) - def K(self, X, X2=None): - r = self._scaled_dist(X, X2) + def K_of_r(self, r): return self.variance * np.exp(-0.5 * r**2) - def dK_dr(self, X, X2): - dist = self._scaled_dist(X, X2) - return -dist*self.K(X, X2) + def dK_dr(self, r): + return -r*self.K_of_r(r) class Cosine(Stationary): def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, name='Cosine'): super(Cosine, self).__init__(input_dim, variance, lengthscale, ARD, name) - def K(self, X, X2=None): - r = self._scaled_dist(X, X2) + def K_of_r(self, r): return self.variance * np.cos(r) - def dK_dr(self, X, X2): - r = self._scaled_dist(X, X2) + def dK_dr(self, r): return -self.variance * np.sin(r) @@ -234,12 +282,10 @@ class RatQuad(Stationary): self.power = Param('power', power, Logexp()) self.add_parameters(self.power) - def K(self, X, X2=None): - r = self._scaled_dist(X, X2) + def K_of_r(self, r): return self.variance*(1. 
+ r**2/2.)**(-self.power) - def dK_dr(self, X, X2): - r = self._scaled_dist(X, X2) + def dK_dr(self, r): return -self.variance*self.power*r*(1. + r**2/2)**(-self.power - 1.) def update_gradients_full(self, dL_dK, X, X2=None):