From ba4bd50924aa9d3bcbdf2e644094cff8ac290cb9 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Thu, 16 May 2013 15:17:54 +0100 Subject: [PATCH] stability enhancing clipping in logexp_clipped and reverse of stability clipping of parameters --- GPy/core/transformations.py | 26 ++++++++------- GPy/kern/kern.py | 65 ++++++++++++++++++------------------- 2 files changed, 46 insertions(+), 45 deletions(-) diff --git a/GPy/core/transformations.py b/GPy/core/transformations.py index 1ad36f78..6e5741fe 100644 --- a/GPy/core/transformations.py +++ b/GPy/core/transformations.py @@ -39,23 +39,25 @@ class logexp(transformation): return '(+ve)' class logexp_clipped(transformation): - max_bound = 1e300 + max_bound = 1e10 + min_bound = 1e-10 log_max_bound = np.log(max_bound) - def __init__(self, lower=1e-15): + log_min_bound = np.log(min_bound) + def __init__(self, lower=1e-6): self.domain = 'positive' self.lower = lower def f(self, x): - exp = np.exp(np.where(x > self.log_max_bound, self.log_max_bound, x)) + exp = np.exp(np.clip(x, self.log_min_bound, self.log_max_bound)) f = np.log(1. + exp) return f def finv(self, f): - return np.log(np.exp(f) - 1.) + return np.log(np.exp(np.clip(f, self.min_bound, self.max_bound)) - 1.) def gradfactor(self, f): ef = np.exp(f) gf = (ef - 1.) 
/ ef return np.where(f < self.lower, 0, gf) - def initialize(self,f): - if np.any(f<0.): + def initialize(self, f): + if np.any(f < 0.): print "Warning: changing parameters to satisfy constraints" return np.abs(f) def __str__(self): @@ -71,7 +73,7 @@ class exponent(transformation): def gradfactor(self, f): return f def initialize(self, f): - if np.any(f<0.): + if np.any(f < 0.): print "Warning: changing parameters to satisfy constraints" return np.abs(f) def __str__(self): @@ -87,7 +89,7 @@ class negative_exponent(transformation): def gradfactor(self, f): return f def initialize(self, f): - if np.any(f>0.): + if np.any(f > 0.): print "Warning: changing parameters to satisfy constraints" return -np.abs(f) def __str__(self): @@ -118,11 +120,11 @@ class logistic(transformation): def finv(self, f): return np.log(np.clip(f - self.lower, 1e-10, np.inf) / np.clip(self.upper - f, 1e-10, np.inf)) def gradfactor(self, f): - return (f-self.lower)*(self.upper-f)/self.difference - def initialize(self,f): - if np.any(np.logical_or(f<self.lower,f>self.upper)): + return (f - self.lower) * (self.upper - f) / self.difference + def initialize(self, f): + if np.any(np.logical_or(f < self.lower, f > self.upper)): print "Warning: changing parameters to satisfy constraints" - return np.where(np.logical_or(f<self.lower,f>self.upper),self.f(f*0.),f) + return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(f * 0.), f) def __str__(self): return '({},{})'.format(self.lower, self.upper) diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py index 13c17139..c682fdcc 100644 --- a/GPy/kern/kern.py +++ b/GPy/kern/kern.py @@ -66,7 +66,7 @@ class kern(parameterised): def _transform_gradients(self, g): x = self._get_params() - [np.put(x,i,x*t.gradfactor(x[i])) for i,t in zip(self.constrained_indices, self.constraints)] + [np.put(x, i, x * t.gradfactor(x[i])) for i, t in zip(self.constrained_indices, self.constraints)] [np.put(g, i, v) for i, v in [(t[0], np.sum(g[t])) for t in self.tied_indices]] if
len(self.tied_indices) or len(self.fixed_indices): to_remove = np.hstack((self.fixed_indices + [t[1:] for t in self.tied_indices])) @@ -88,7 +88,7 @@ class kern(parameterised): """ return self.add(other) - def add(self, other,tensor=False): + def add(self, other, tensor=False): """ Add another kernel to this one. Both kernels are defined on the same _space_ :param other: the other kernel to be added @@ -103,7 +103,7 @@ class kern(parameterised): newkern = kern(D, self.parts + other.parts, self_input_slices + other_input_slices) # transfer constraints: - newkern.constrained_indices = self.constrained_indices + [x+self.Nparam for x in other.constrained_indices] + newkern.constrained_indices = self.constrained_indices + [x + self.Nparam for x in other.constrained_indices] newkern.constraints = self.constraints + other.constraints newkern.fixed_indices = self.fixed_indices + [self.Nparam + x for x in other.fixed_indices] newkern.fixed_values = self.fixed_values + other.fixed_values @@ -113,7 +113,7 @@ class kern(parameterised): assert self.D == other.D newkern = kern(self.D, self.parts + other.parts, self.input_slices + other.input_slices) # transfer constraints: - newkern.constrained_indices = self.constrained_indices + [i+self.Nparam for i in other.constrained_indices] + newkern.constrained_indices = self.constrained_indices + [i + self.Nparam for i in other.constrained_indices] newkern.constraints = self.constraints + other.constraints newkern.fixed_indices = self.fixed_indices + [self.Nparam + x for x in other.fixed_indices] newkern.fixed_values = self.fixed_values + other.fixed_values @@ -126,7 +126,7 @@ class kern(parameterised): """ return self.prod(other) - def prod(self, other,tensor=False): + def prod(self, other, tensor=False): """ multiply two kernels (either on the same space, or on the tensor product of the input space) :param other: the other kernel to be added @@ -136,12 +136,12 @@ class kern(parameterised): K2 = other.copy() slices = [] - for sl1, sl2 
in itertools.product(K1.input_slices,K2.input_slices): - s1, s2 = [False]*K1.D, [False]*K2.D + for sl1, sl2 in itertools.product(K1.input_slices, K2.input_slices): + s1, s2 = [False] * K1.D, [False] * K2.D s1[sl1], s2[sl2] = [True], [True] - slices += [s1+s2] + slices += [s1 + s2] - newkernparts = [prod(k1, k2,tensor) for k1, k2 in itertools.product(K1.parts, K2.parts)] + newkernparts = [prod(k1, k2, tensor) for k1, k2 in itertools.product(K1.parts, K2.parts)] if tensor: newkern = kern(K1.D + K2.D, newkernparts, slices) @@ -189,14 +189,13 @@ class kern(parameterised): index = np.where(index_param == i)[0] if index.size > 1: self.tie_params(index) - for i,t in zip(prev_constr_ind,prev_constr): - self.constrain(np.where(index_param == i)[0],t) + for i, t in zip(prev_constr_ind, prev_constr): + self.constrain(np.where(index_param == i)[0], t) def _get_params(self): return np.hstack([p._get_params() for p in self.parts]) def _set_params(self, x): - x = np.clip(x, -1e300, 1e300) [p._set_params(x[s]) for p, s in zip(self.parts, self.param_slices)] def _get_param_names(self): @@ -209,15 +208,15 @@ class kern(parameterised): return sum([[name + '_' + n for n in k._get_param_names()] for name, k in zip(names, self.parts)], []) def K(self, X, X2=None, which_parts='all'): - if which_parts=='all': - which_parts = [True]*self.Nparts + if which_parts == 'all': + which_parts = [True] * self.Nparts assert X.shape[1] == self.D if X2 is None: target = np.zeros((X.shape[0], X.shape[0])) [p.K(X[:, i_s], None, target=target) for p, i_s, part_i_used in zip(self.parts, self.input_slices, which_parts) if part_i_used] else: target = np.zeros((X.shape[0], X2.shape[0])) - [p.K(X[:, i_s], X2[:,i_s], target=target) for p, i_s, part_i_used in zip(self.parts, self.input_slices, which_parts) if part_i_used] + [p.K(X[:, i_s], X2[:, i_s], target=target) for p, i_s, part_i_used in zip(self.parts, self.input_slices, which_parts) if part_i_used] return target def dK_dtheta(self, dL_dK, X, X2=None): @@ 
-249,8 +248,8 @@ class kern(parameterised): return target def Kdiag(self, X, which_parts='all'): - if which_parts=='all': - which_parts = [True]*self.Nparts + if which_parts == 'all': + which_parts = [True] * self.Nparts assert X.shape[1] == self.D target = np.zeros(X.shape[0]) [p.Kdiag(X[:, i_s], target=target) for p, i_s, part_on in zip(self.parts, self.input_slices, which_parts) if part_on] @@ -271,22 +270,22 @@ class kern(parameterised): def psi0(self, Z, mu, S): target = np.zeros(mu.shape[0]) - [p.psi0(Z[:,i_s], mu[:,i_s], S[:,i_s], target) for p, i_s in zip(self.parts, self.input_slices)] + [p.psi0(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)] return target def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S): target = np.zeros(self.Nparam) - [p.dpsi0_dtheta(dL_dpsi0, Z[:,i_s], mu[:,i_s], S[:,i_s], target[ps]) for p, ps, i_s in zip(self.parts, self.param_slices, self.input_slices)] + [p.dpsi0_dtheta(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self.parts, self.param_slices, self.input_slices)] return self._transform_gradients(target) def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S): target_mu, target_S = np.zeros_like(mu), np.zeros_like(S) - [p.dpsi0_dmuS(dL_dpsi0, Z[:,i_s], mu[:,i_s], S[:,i_s], target_mu[:,i_s], target_S[:,i_s]) for p, i_s in zip(self.parts, self.input_slices)] + [p.dpsi0_dmuS(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)] return target_mu, target_S def psi1(self, Z, mu, S): target = np.zeros((mu.shape[0], Z.shape[0])) - [p.psi1(Z[:,i_s], mu[:,i_s], S[:,i_s], target) for p, i_s in zip(self.parts, self.input_slices)] + [p.psi1(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)] return target def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S): @@ -315,7 +314,7 @@ class kern(parameterised): [p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, 
self.input_slices)] # compute the "cross" terms - #TODO: input_slices needed + # TODO: input_slices needed for p1, p2 in itertools.combinations(self.parts, 2): # white doesn;t combine with anything if p1.name == 'white' or p2.name == 'white': @@ -336,9 +335,9 @@ class kern(parameterised): target += p2.variance * (tmp[:, :, None] + tmp[:, None, :]) # rbf X linear elif p1.name == 'linear' and p2.name == 'rbf': - raise NotImplementedError # TODO + raise NotImplementedError # TODO elif p2.name == 'linear' and p1.name == 'rbf': - raise NotImplementedError # TODO + raise NotImplementedError # TODO else: raise NotImplementedError, "psi2 cannot be computed for this kernel" return target @@ -366,7 +365,7 @@ class kern(parameterised): p2.dpsi1_dtheta(dL_dpsi2.sum(1) * p1._psi1 * 2., Z, mu, S, target[ps2]) # linear X bias elif p1.name == 'bias' and p2.name == 'linear': - p2.dpsi1_dtheta(dL_dpsi2.sum(1) * p1.variance * 2., Z, mu, S, target[ps2]) # [ps1]) + p2.dpsi1_dtheta(dL_dpsi2.sum(1) * p1.variance * 2., Z, mu, S, target[ps2]) # [ps1]) psi1 = np.zeros((mu.shape[0], Z.shape[0])) p2.psi1(Z, mu, S, psi1) p1.dpsi1_dtheta(dL_dpsi2.sum(1) * psi1 * 2., Z, mu, S, target[ps1]) @@ -377,9 +376,9 @@ class kern(parameterised): p2.dpsi1_dtheta(dL_dpsi2.sum(1) * psi1 * 2., Z, mu, S, target[ps2]) # rbf X linear elif p1.name == 'linear' and p2.name == 'rbf': - raise NotImplementedError # TODO + raise NotImplementedError # TODO elif p2.name == 'linear' and p1.name == 'rbf': - raise NotImplementedError # TODO + raise NotImplementedError # TODO else: raise NotImplementedError, "psi2 cannot be computed for this kernel" @@ -390,7 +389,7 @@ class kern(parameterised): [p.dpsi2_dZ(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)] # compute the "cross" terms - #TODO: we need input_slices here. + # TODO: we need input_slices here. 
for p1, p2 in itertools.combinations(self.parts, 2): # white doesn;t combine with anything if p1.name == 'white' or p2.name == 'white': @@ -407,9 +406,9 @@ class kern(parameterised): p1.dpsi1_dZ(dL_dpsi2.sum(1).T * p2.variance, Z, mu, S, target) # rbf X linear elif p1.name == 'linear' and p2.name == 'rbf': - raise NotImplementedError # TODO + raise NotImplementedError # TODO elif p2.name == 'linear' and p1.name == 'rbf': - raise NotImplementedError # TODO + raise NotImplementedError # TODO else: raise NotImplementedError, "psi2 cannot be computed for this kernel" @@ -420,7 +419,7 @@ class kern(parameterised): [p.dpsi2_dmuS(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)] # compute the "cross" terms - #TODO: we need input_slices here. + # TODO: we need input_slices here. for p1, p2 in itertools.combinations(self.parts, 2): # white doesn;t combine with anything if p1.name == 'white' or p2.name == 'white': @@ -437,9 +436,9 @@ class kern(parameterised): p1.dpsi1_dmuS(dL_dpsi2.sum(1).T * p2.variance * 2., Z, mu, S, target_mu, target_S) # rbf X linear elif p1.name == 'linear' and p2.name == 'rbf': - raise NotImplementedError # TODO + raise NotImplementedError # TODO elif p2.name == 'linear' and p1.name == 'rbf': - raise NotImplementedError # TODO + raise NotImplementedError # TODO else: raise NotImplementedError, "psi2 cannot be computed for this kernel"