GPy (mirror of https://github.com/SheffieldML/GPy.git)

Commit ba74e29aee: merge with upstream
115 changed files with 1178 additions and 531 deletions

@@ -10,7 +10,7 @@ from .src.add import Add
from .src.prod import Prod
from .src.rbf import RBF
from .src.linear import Linear, LinearFull
-from .src.static import Bias, White, Fixed
+from .src.static import Bias, White, Fixed, WhiteHeteroscedastic
from .src.brownian import Brownian
from .src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
from .src.mlp import MLP

@@ -28,4 +28,4 @@ from .src.trunclinear import TruncLinear,TruncLinear_inf
from .src.splitKern import SplitKern,DEtime
from .src.splitKern import DEtime as DiffGenomeKern
from .src.spline import Spline
-from .src.basis_funcs import LogisticBasisFuncKernel, LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel
+from .src.basis_funcs import LogisticBasisFuncKernel, LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel

@@ -162,4 +162,4 @@ class ODE_t(Kern):

        self.lengthscale_Yt.gradient = np.sum(dkYdlent*(-0.5*self.lengthscale_Yt**(-2)) * dL_dK)

-        self.ubias.gradient = np.sum(dkdubias * dL_dK)
+        self.ubias.gradient = np.sum(dkdubias * dL_dK)

@@ -1 +1 @@
-from . import psi_comp
+from . import psi_comp

@@ -19,8 +19,8 @@ class Add(CombinationKernel):
            if isinstance(kern, Add):
                del subkerns[i]
                for part in kern.parts[::-1]:
-                    kern.unlink_parameter(part)
-                    subkerns.insert(i, part)
+                    #kern.unlink_parameter(part)
+                    subkerns.insert(i, part.copy())
        super(Add, self).__init__(subkerns, name)
        self._exact_psicomp = self._check_exact_psicomp()

@@ -37,7 +37,7 @@ class Add(CombinationKernel):
        else:
            return False

-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def K(self, X, X2=None, which_parts=None):
        """
        Add all kernels together.

@@ -51,7 +51,7 @@ class Add(CombinationKernel):
            which_parts = [which_parts]
        return reduce(np.add, (p.K(X, X2) for p in which_parts))

-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def Kdiag(self, X, which_parts=None):
        if which_parts is None:
            which_parts = self.parts

@@ -98,17 +98,17 @@ class Add(CombinationKernel):
        [target.__iadd__(p.gradients_XX_diag(dL_dKdiag, X)) for p in self.parts]
        return target

-    @Cache_this(limit=1, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def psi0(self, Z, variational_posterior):
        if not self._exact_psicomp: return Kern.psi0(self,Z,variational_posterior)
        return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts))

-    @Cache_this(limit=1, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def psi1(self, Z, variational_posterior):
        if not self._exact_psicomp: return Kern.psi1(self,Z,variational_posterior)
        return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts))

-    @Cache_this(limit=1, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def psi2(self, Z, variational_posterior):
        if not self._exact_psicomp: return Kern.psi2(self,Z,variational_posterior)
        psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts))

@@ -144,7 +144,7 @@ class Add(CombinationKernel):
            raise NotImplementedError("psi2 cannot be computed for this kernel")
        return psi2

-    @Cache_this(limit=1, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def psi2n(self, Z, variational_posterior):
        if not self._exact_psicomp: return Kern.psi2n(self, Z, variational_posterior)
        psi2 = reduce(np.add, (p.psi2n(Z, variational_posterior) for p in self.parts))

@@ -241,16 +241,20 @@ class Add(CombinationKernel):
        [np.add(target_grads[i],grads[i],target_grads[i]) for i in range(len(grads))]
        return target_grads

-    def add(self, other):
-        if isinstance(other, Add):
-            other_params = other.parameters[:]
-            for p in other_params:
-                other.unlink_parameter(p)
-            self.link_parameters(*other_params)
-        else:
-            self.link_parameter(other)
-        self.input_dim, self._all_dims_active = self.get_input_dim_active_dims(self.parts)
-        return self
+    #def add(self, other):
+    # parts = self.parts
+    # if 0:#isinstance(other, Add):
+    # #other_params = other.parameters[:]
+    # for p in other.parts[:]:
+    # other.unlink_parameter(p)
+    # parts.extend(other.parts)
+    # #self.link_parameters(*other_params)
+    #
+    # else:
+    # #self.link_parameter(other)
+    # parts.append(other)
+    # #self.input_dim, self._all_dims_active = self.get_input_dim_active_dims(parts)
+    # return Add([p for p in parts], self.name)

    def input_sensitivity(self, summarize=True):
        if summarize:

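Note: the first hunk above changes how nested Add kernels are flattened. When a sum is built from an existing Add, that Add's parts are now copied (part.copy()) instead of being unlinked from their original owner, so the original kernel keeps its parameters. A minimal sketch of what this implies for user code, assuming the public GPy.kern API (the example is illustrative and not part of the diff):

    import GPy

    # k1 is itself an Add kernel
    k1 = GPy.kern.RBF(1) + GPy.kern.White(1)

    # Building a new sum from k1 flattens it; with the change above the
    # parts of k1 are copied into k2 rather than unlinked from k1.
    k2 = k1 + GPy.kern.Bias(1)

    print(len(k1.parts))  # k1 should still own its two parts
    print(len(k2.parts))  # k2 holds copies of them plus the Bias part
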
@@ -64,7 +64,7 @@ class EQ_ODE2(Kern):
        self.W = Param('W', W)
        self.link_parameters(self.lengthscale, self.C, self.B, self.W)

-    @Cache_this(limit=2)
+    @Cache_this(limit=3)
    def K(self, X, X2=None):
        #This way is not working, indexes are lost after using k._slice_X
        #index = np.asarray(X, dtype=np.int)

@@ -48,11 +48,12 @@ class Kern(Parameterized):

        if active_dims is None:
            active_dims = np.arange(input_dim)

-        self.active_dims = active_dims
-        self._all_dims_active = np.atleast_1d(active_dims).astype(int)
-
-        assert self._all_dims_active.size == self.input_dim, "input_dim={} does not match len(active_dim)={}, _all_dims_active={}".format(self.input_dim, self._all_dims_active.size, self._all_dims_active)
+        self.active_dims = np.asarray(active_dims, np.int_)
+
+        self._all_dims_active = np.atleast_1d(self.active_dims).astype(int)
+
+        assert self.active_dims.size == self.input_dim, "input_dim={} does not match len(active_dim)={}".format(self.input_dim, self._all_dims_active.size)

        self._sliced_X = 0
        self.useGPU = self._support_GPU and useGPU

@@ -68,9 +69,12 @@ class Kern(Parameterized):
    def _effective_input_dim(self):
        return np.size(self._all_dims_active)

-    @Cache_this(limit=20)
+    @Cache_this(limit=3)
    def _slice_X(self, X):
-        return X[:, self._all_dims_active]
+        try:
+            return X[:, self._all_dims_active].astype('float')
+        except:
+            return X[:, self._all_dims_active]

    def K(self, X, X2):
        """

@@ -319,10 +323,20 @@ class CombinationKernel(Kern):
        :param array-like extra_dims: if needed extra dimensions for the combination kernel to work on
        """
        assert all([isinstance(k, Kern) for k in kernels])
-        extra_dims = np.array(extra_dims, dtype=int)
-        input_dim, active_dims = self.get_input_dim_active_dims(kernels, extra_dims)
+        extra_dims = np.asarray(extra_dims, dtype=int)
+
+        active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int))
+
+        input_dim = active_dims.size
+        if extra_dims is not None:
+            input_dim += extra_dims.size

        # initialize the kernel with the full input_dim
        super(CombinationKernel, self).__init__(input_dim, active_dims, name)
+
+        effective_input_dim = reduce(max, (k._all_dims_active.max() for k in kernels)) + 1
+        self._all_dims_active = np.array(np.concatenate((np.arange(effective_input_dim), extra_dims if extra_dims is not None else [])), dtype=int)

        self.extra_dims = extra_dims
        self.link_parameters(*kernels)

@@ -330,16 +344,8 @@ class CombinationKernel(Kern):
    def parts(self):
        return self.parameters

-    def get_input_dim_active_dims(self, kernels, extra_dims = None):
-        self.active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int))
-        #_all_dims_active = np.array(np.concatenate((_all_dims_active, extra_dims if extra_dims is not None else [])), dtype=int)
-        input_dim = reduce(max, (k._all_dims_active.max() for k in kernels)) + 1
-
-        if extra_dims is not None:
-            input_dim += extra_dims.size
-
-        _all_dims_active = np.arange(input_dim)
-        return input_dim, _all_dims_active
+    def _set_all_dims_ative(self):
+        self._all_dims_active = np.atleast_1d(self.active_dims).astype(int)

    def input_sensitivity(self, summarize=True):
        """

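Note: the CombinationKernel hunks above replace get_input_dim_active_dims with an inline computation. The combined kernel's active_dims is now the union of the parts' active dimensions, input_dim is the size of that union (plus any extra_dims), and _all_dims_active spans every dimension any part touches. A short sketch of how this plays out for kernels acting on different input columns, assuming the public GPy.kern API:

    import numpy as np
    import GPy

    # RBF on columns 0 and 2, Linear on column 1
    k = GPy.kern.RBF(2, active_dims=[0, 2]) + GPy.kern.Linear(1, active_dims=[1])

    print(k.active_dims)   # expected: the union [0 1 2]
    print(k.input_dim)     # expected: 3, the size of that union

    X = np.random.randn(10, 3)
    print(k.K(X).shape)    # (10, 10); each part slices out its own columns
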
@@ -51,7 +51,7 @@ class Linear(Kern):
        self.link_parameter(self.variances)
        self.psicomp = PSICOMP_Linear()

-    @Cache_this(limit=2)
+    @Cache_this(limit=3)
    def K(self, X, X2=None):
        if self.ARD:
            if X2 is None:

@@ -62,7 +62,7 @@ class Linear(Kern):
        else:
            return self._dot_product(X, X2) * self.variances

-    @Cache_this(limit=1, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
    def _dot_product(self, X, X2=None):
        if X2 is None:
            return tdot(X)

@@ -45,7 +45,7 @@ class MLP(Kern):
        self.link_parameters(self.variance, self.weight_variance, self.bias_variance)


-    @Cache_this(limit=20, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
    def K(self, X, X2=None):
        if X2 is None:
            X_denom = np.sqrt(self._comp_prod(X)+1.)

@@ -57,7 +57,7 @@ class MLP(Kern):
            XTX = self._comp_prod(X,X2)/X_denom[:,None]/X2_denom[None,:]
        return self.variance*four_over_tau*np.arcsin(XTX)

-    @Cache_this(limit=20, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
    def Kdiag(self, X):
        """Compute the diagonal of the covariance matrix for X."""
        X_prod = self._comp_prod(X)

@@ -88,14 +88,14 @@ class MLP(Kern):
        """Gradient of diagonal of covariance with respect to X"""
        return self._comp_grads_diag(dL_dKdiag, X)[3]

-    @Cache_this(limit=50, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
    def _comp_prod(self, X, X2=None):
        if X2 is None:
            return (np.square(X)*self.weight_variance).sum(axis=1)+self.bias_variance
        else:
            return (X*self.weight_variance).dot(X2.T)+self.bias_variance

-    @Cache_this(limit=20, ignore_args=(1,))
+    @Cache_this(limit=3, ignore_args=(1,))
    def _comp_grads(self, dL_dK, X, X2=None):
        var,w,b = self.variance, self.weight_variance, self.bias_variance
        K = self.K(X, X2)

@@ -130,7 +130,7 @@ class MLP(Kern):
        dX2 = common.T.dot(X)*w-((common*XTX).sum(axis=0)/(X2_prod+1.))[:,None]*X2*w
        return dvar, dw, db, dX, dX2

-    @Cache_this(limit=20, ignore_args=(1,))
+    @Cache_this(limit=3, ignore_args=(1,))
    def _comp_grads_diag(self, dL_dKdiag, X):
        var,w,b = self.variance, self.weight_variance, self.bias_variance
        K = self.Kdiag(X)

@@ -5,32 +5,49 @@ import numpy as np
from .kern import Kern
from ...core.parameterization import Param
from paramz.transformations import Logexp
+from paramz.caching import Cache_this

class Poly(Kern):
    """
    Polynomial kernel
    """

-    def __init__(self, input_dim, variance=1., order=3., active_dims=None, name='poly'):
+    def __init__(self, input_dim, variance=1., scale=1., bias=1., order=3., active_dims=None, name='poly'):
        super(Poly, self).__init__(input_dim, active_dims, name)
        self.variance = Param('variance', variance, Logexp())
-        self.link_parameter(self.variance)
+        self.scale = Param('scale', scale, Logexp())
+        self.bias = Param('bias', bias, Logexp())
+
+        self.link_parameters(self.variance, self.scale, self.bias)
        assert order >= 1, 'The order of the polynomial has to be at least 1.'
        self.order=order

-    def K(self, X, X2=None):
-        return (self._dot_product(X, X2) + 1.)**self.order * self.variance
-
-    def _dot_product(self, X, X2=None):
+    def K(self, X, X2=None):
+        _, _, B = self._AB(X, X2)
+        return B * self.variance
+
+    @Cache_this(limit=3)
+    def _AB(self, X, X2=None):
        if X2 is None:
-            return np.dot(X, X.T)
+            dot_prod = np.dot(X, X.T)
        else:
-            return np.dot(X, X2.T)
+            dot_prod = np.dot(X, X2.T)
+        A = (self.scale * dot_prod) + self.bias
+        B = A ** self.order
+        return dot_prod, A, B

    def Kdiag(self, X):
-        return self.variance*(np.square(X).sum(1) + 1.)**self.order
+        return self.K(X).diagonal()#self.variance*(np.square(X).sum(1) + 1.)**self.order

    def update_gradients_full(self, dL_dK, X, X2=None):
-        self.variance.gradient = np.sum(dL_dK * (self._dot_product(X, X2) + 1.)**self.order)
+        dot_prod, A, B = self._AB(X, X2)
+        dK_dA = self.variance * self.order * A ** (self.order-1.)
+        dL_dA = dL_dK * (dK_dA)
+        self.scale.gradient = (dL_dA * dot_prod).sum()
+        self.bias.gradient = dL_dA.sum()
+        self.variance.gradient = np.sum(dL_dK * B)
+        #import ipdb;ipdb.set_trace()

    def update_gradients_diag(self, dL_dKdiag, X):
        raise NotImplementedError

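Note: with the new scale and bias parameters the Poly kernel computes K(X, X2) = variance * (scale * X X2^T + bias)^order, and its gradients follow from dK/dA = variance * order * A^(order-1) with A = scale * X X2^T + bias. A small sketch checking the covariance against a direct NumPy computation, assuming Poly is exported as GPy.kern.Poly as in current releases:

    import numpy as np
    import GPy

    X = np.random.randn(6, 2)
    k = GPy.kern.Poly(2, variance=0.7, scale=2.0, bias=0.5, order=3.0)

    # direct evaluation of variance * (scale * X X^T + bias) ** order
    K_manual = 0.7 * (2.0 * X.dot(X.T) + 0.5) ** 3.0

    print(np.allclose(k.K(X), K_manual))               # expected True
    print(np.allclose(k.Kdiag(X), np.diag(K_manual)))  # Kdiag is now K(X).diagonal()
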
@@ -39,7 +39,7 @@ class Prod(CombinationKernel):
                kernels.insert(i, part)
        super(Prod, self).__init__(kernels, name)

-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def K(self, X, X2=None, which_parts=None):
        if which_parts is None:
            which_parts = self.parts

@@ -48,7 +48,7 @@ class Prod(CombinationKernel):
            which_parts = [which_parts]
        return reduce(np.multiply, (p.K(X, X2) for p in which_parts))

-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def Kdiag(self, X, which_parts=None):
        if which_parts is None:
            which_parts = self.parts

@@ -21,7 +21,7 @@ from .gaussherm import PSICOMP_GH
from . import rbf_psi_comp, linear_psi_comp, ssrbf_psi_comp, sslinear_psi_comp

class PSICOMP_RBF(PSICOMP):
-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
    def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
        variance, lengthscale = kern.variance, kern.lengthscale
        if isinstance(variational_posterior, variational.NormalPosterior):

@@ -31,7 +31,7 @@ class PSICOMP_RBF(PSICOMP):
        else:
            raise ValueError("unknown distriubtion received for psi-statistics")

-    @Cache_this(limit=10, ignore_args=(0,2,3,4))
+    @Cache_this(limit=3, ignore_args=(0,2,3,4))
    def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        variance, lengthscale = kern.variance, kern.lengthscale
        if isinstance(variational_posterior, variational.NormalPosterior):

@@ -43,7 +43,7 @@ class PSICOMP_RBF(PSICOMP):

class PSICOMP_Linear(PSICOMP):

-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
    def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
        variances = kern.variances
        if isinstance(variational_posterior, variational.NormalPosterior):

@@ -53,7 +53,7 @@ class PSICOMP_Linear(PSICOMP):
        else:
            raise ValueError("unknown distriubtion received for psi-statistics")

-    @Cache_this(limit=10, ignore_args=(0,2,3,4))
+    @Cache_this(limit=3, ignore_args=(0,2,3,4))
    def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        variances = kern.variances
        if isinstance(variational_posterior, variational.NormalPosterior):

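Note: across these files the commit standardises the @Cache_this limits (previously 1, 2, 5, 10, 20 or 50) to 3, so each decorated method keeps results for its three most recent distinct inputs. A minimal sketch of the decorator on an ordinary method, assuming paramz caching behaves as it does inside GPy (the cache keys off observable array arguments such as ObsAr):

    import numpy as np
    from paramz import ObsAr
    from paramz.caching import Cache_this

    class Demo(object):
        # keep up to 3 cached results; ignore_args=(0,) leaves `self` out of the cache key
        @Cache_this(limit=3, ignore_args=(0,))
        def gram(self, X):
            return X.dot(X.T)

    d = Demo()
    X = ObsAr(np.random.randn(100, 4))
    A = d.gram(X)   # computed
    B = d.gram(X)   # same observable input: intended to be served from the cache
    print(np.allclose(A, B))
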
@@ -27,7 +27,7 @@ class PSICOMP_GH(PSICOMP):
    def _setup_observers(self):
        pass

-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
    def comp_K(self, Z, qX):
        if self.Xs is None or self.Xs.shape != qX.mean.shape:
            from paramz import ObsAr

@@ -38,7 +38,7 @@ class PSICOMP_GH(PSICOMP):
            self.Xs[i] = self.locs[i]*S_sq+mu
        return self.Xs

-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
    def psicomputations(self, kern, Z, qX, return_psi2_n=False):
        mu, S = qX.mean.values, qX.variance.values
        N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]

@@ -62,7 +62,7 @@ class PSICOMP_GH(PSICOMP):
            psi2 += self.weights[i]* tdot(Kfu.T)
        return psi0, psi1, psi2

-    @Cache_this(limit=10, ignore_args=(0, 2,3,4))
+    @Cache_this(limit=3, ignore_args=(0, 2,3,4))
    def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, qX):
        mu, S = qX.mean.values, qX.variance.values
        if self.cache_K: Xs = self.comp_K(Z, qX)

@@ -132,5 +132,5 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):

    return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS

-_psi1computations = Cacher(__psi1computations, limit=5)
-_psi2computations = Cacher(__psi2computations, limit=5)
+_psi1computations = Cacher(__psi1computations, limit=3)
+_psi2computations = Cacher(__psi2computations, limit=3)

@@ -324,7 +324,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
        except:
            return self.fall_back.psicomputations(kern, Z, variational_posterior, return_psi2_n)

-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
    def _psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
        """
        Z - MxQ

@@ -369,7 +369,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
        except:
            return self.fall_back.psiDerivativecomputations(kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)

-    @Cache_this(limit=10, ignore_args=(0,2,3,4))
+    @Cache_this(limit=3, ignore_args=(0,2,3,4))
    def _psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        # resolve the requirement of dL_dpsi2 to be symmetric
        if len(dL_dpsi2.shape)==2: dL_dpsi2 = (dL_dpsi2+dL_dpsi2.T)/2

@@ -88,7 +88,7 @@ try:
            return psi0,psi1,psi2,psi2n

        from GPy.util.caching import Cacher
-        psicomputations = Cacher(_psicomputations, limit=1)
+        psicomputations = Cacher(_psicomputations, limit=3)

        def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
            ARD = (len(lengthscale)!=1)

@@ -373,7 +373,7 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
    def get_dimensions(self, Z, variational_posterior):
        return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]

-    @Cache_this(limit=1, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
    def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
        """
        Z - MxQ

@@ -407,7 +407,7 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
        else:
            return psi0, psi1_gpu.get(), psi2_gpu.get()

-    @Cache_this(limit=1, ignore_args=(0,2,3,4))
+    @Cache_this(limit=3, ignore_args=(0,2,3,4))
    def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        variance, lengthscale = kern.variance, kern.lengthscale
        from ....util.linalg_gpu import sum_axis

@@ -1,6 +1,5 @@
# -*- coding: utf-8 -*-
-
# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
# Copyright (c) 2015, Alex Grigorevskiy
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
The standard periodic kernel which mentioned in:

@@ -9,7 +8,7 @@ The standard periodic kernel which mentioned in:
    The MIT Press, 2005.


-[2] Introduction to Gaussian processes. D. J. C. MacKay. In C. M. Bishop, editor,
+[2] Introduction to Gaussian processes. D. J. C. MacKay. In C. M. Bishop, editor,
    Neural Networks and Machine Learning, pages 133-165. Springer, 1998.
"""

@@ -25,56 +24,56 @@ class StdPeriodic(Kern):

    .. math::

-       k(x,y) = \theta_1 \exp \left[ - \frac{1}{2} {}\sum_{i=1}^{input\_dim}
-       \left( \frac{\sin(\frac{\pi}{\lambda_i} (x_i - y_i) )}{l_i} \right)^2 \right] }
+       k(x,y) = \theta_1 \exp \left[ - \frac{1}{2} \sum_{i=1}^{input\_dim}
+       \left( \frac{\sin(\frac{\pi}{T_i} (x_i - y_i) )}{l_i} \right)^2 \right] }

    :param input_dim: the number of input dimensions
    :type input_dim: int
    :param variance: the variance :math:`\theta_1` in the formula above
    :type variance: float
-    :param wavelength: the vector of wavelengths :math:`\lambda_i`. If None then 1.0 is assumed.
-    :type wavelength: array or list of the appropriate size (or float if there is only one wavelength parameter)
+    :param period: the vector of periods :math:`\T_i`. If None then 1.0 is assumed.
+    :type period: array or list of the appropriate size (or float if there is only one period parameter)
    :param lengthscale: the vector of lengthscale :math:`\l_i`. If None then 1.0 is assumed.
    :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
-    :param ARD1: Auto Relevance Determination with respect to wavelength.
-        If equal to "False" one single wavelength parameter :math:`\lambda_i` for
-        each dimension is assumed, otherwise there is one lengthscale
+    :param ARD1: Auto Relevance Determination with respect to period.
+        If equal to "False" one single period parameter :math:`\T_i` for
+        each dimension is assumed, otherwise there is one lengthscale
        parameter per dimension.
    :type ARD1: Boolean
-    :param ARD2: Auto Relevance Determination with respect to lengthscale.
-        If equal to "False" one single wavelength parameter :math:`l_i` for
-        each dimension is assumed, otherwise there is one lengthscale
+    :param ARD2: Auto Relevance Determination with respect to lengthscale.
+        If equal to "False" one single lengthscale parameter :math:`l_i` for
+        each dimension is assumed, otherwise there is one lengthscale
        parameter per dimension.
    :type ARD2: Boolean
    :param active_dims: indices of dimensions which are used in the computation of the kernel
-    :type wavelength: array or list of the appropriate size
+    :type active_dims: array or list of the appropriate size
    :param name: Name of the kernel for output
    :type String
    :param useGPU: whether of not use GPU
    :type Boolean
    """

-    def __init__(self, input_dim, variance=1., wavelength=None, lengthscale=None, ARD1=False, ARD2=False, active_dims=None, name='std_periodic',useGPU=False):
+    def __init__(self, input_dim, variance=1., period=None, lengthscale=None, ARD1=False, ARD2=False, active_dims=None, name='std_periodic',useGPU=False):
        super(StdPeriodic, self).__init__(input_dim, active_dims, name, useGPU=useGPU)
        self.input_dim = input_dim
-        self.ARD1 = ARD1 # correspond to wavelengths
+        self.ARD1 = ARD1 # correspond to periods
        self.ARD2 = ARD2 # correspond to lengthscales

        self.name = name

        if self.ARD1 == False:
-            if wavelength is not None:
-                wavelength = np.asarray(wavelength)
-                assert wavelength.size == 1, "Only one wavelength needed for non-ARD kernel"
+            if period is not None:
+                period = np.asarray(period)
+                assert period.size == 1, "Only one period needed for non-ARD kernel"
            else:
-                wavelength = np.ones(1)
+                period = np.ones(1.0)
        else:
-            if wavelength is not None:
-                wavelength = np.asarray(wavelength)
-                assert wavelength.size == input_dim, "bad number of wavelengths"
+            if period is not None:
+                period = np.asarray(period)
+                assert period.size == input_dim, "bad number of periods"
            else:
-                wavelength = np.ones(input_dim)
+                period = np.ones(input_dim)

        if self.ARD2 == False:
            if lengthscale is not None:
                lengthscale = np.asarray(lengthscale)

@@ -87,33 +86,33 @@ class StdPeriodic(Kern):
                assert lengthscale.size == input_dim, "bad number of lengthscales"
            else:
                lengthscale = np.ones(input_dim)

        self.variance = Param('variance', variance, Logexp())
        assert self.variance.size==1, "Variance size must be one"
-        self.wavelengths = Param('wavelengths', wavelength, Logexp())
-        self.lengthscales = Param('lengthscales', lengthscale, Logexp())
-
-        self.link_parameters(self.variance, self.wavelengths, self.lengthscales)
+        self.period = Param('period', period, Logexp())
+        self.lengthscale = Param('lengthscale', lengthscale, Logexp())
+
+        self.link_parameters(self.variance, self.period, self.lengthscale)

    def parameters_changed(self):
        """
-        This functions deals as a callback for each optimization iteration.
+        This functions deals as a callback for each optimization iteration.
        If one optimization step was successfull and the parameters
-        this callback function will be called to be able to update any
+        this callback function will be called to be able to update any
        precomputations for the kernel.
        """

        pass


    def K(self, X, X2=None):
        """Compute the covariance matrix between X and X2."""
-        if X2 is None:
+        if X2 is None:
            X2 = X

-        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths
-        exp_dist = np.exp( -0.5* np.sum( np.square( np.sin( base ) / self.lengthscales ), axis = -1 ) )
+        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.period
+        exp_dist = np.exp( -0.5* np.sum( np.square( np.sin( base ) / self.lengthscale ), axis = -1 ) )

        return self.variance * exp_dist

@@ -125,42 +124,42 @@ class StdPeriodic(Kern):

    def update_gradients_full(self, dL_dK, X, X2=None):
        """derivative of the covariance matrix with respect to the parameters."""
-        if X2 is None:
+        if X2 is None:
            X2 = X

-        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths
-
-        sin_base = np.sin( base )
-        exp_dist = np.exp( -0.5* np.sum( np.square( sin_base / self.lengthscales ), axis = -1 ) )
-
-        dwl = self.variance * (1.0/np.square(self.lengthscales)) * sin_base*np.cos(base) * (base / self.wavelengths)
-
-        dl = self.variance * np.square( sin_base) / np.power( self.lengthscales, 3)
-
-        self.variance.gradient = np.sum(exp_dist * dL_dK)
-        #target[0] += np.sum( exp_dist * dL_dK)
-
-        if self.ARD1: # different wavelengths
-            self.wavelengths.gradient = (dwl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
-        else: # same wavelengths
-            self.wavelengths.gradient = np.sum(dwl.sum(-1) * exp_dist * dL_dK)
-
+        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.period
+
+        sin_base = np.sin( base )
+        exp_dist = np.exp( -0.5* np.sum( np.square( sin_base / self.lengthscale ), axis = -1 ) )
+
+        dwl = self.variance * (1.0/np.square(self.lengthscale)) * sin_base*np.cos(base) * (base / self.period)
+
+        dl = self.variance * np.square( sin_base) / np.power( self.lengthscale, 3)
+
+        self.variance.gradient = np.sum(exp_dist * dL_dK)
+        #target[0] += np.sum( exp_dist * dL_dK)
+
+        if self.ARD1: # different periods
+            self.period.gradient = (dwl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
+        else: # same period
+            self.period.gradient = np.sum(dwl.sum(-1) * exp_dist * dL_dK)
+
        if self.ARD2: # different lengthscales
-            self.lengthscales.gradient = (dl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
+            self.lengthscale.gradient = (dl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
        else: # same lengthscales
-            self.lengthscales.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK)
-
+            self.lengthscale.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK)

    def update_gradients_diag(self, dL_dKdiag, X):
        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
        self.variance.gradient = np.sum(dL_dKdiag)
-        self.wavelengths.gradient = 0
-        self.lengthscales.gradient = 0
+        self.period.gradient = 0
+        self.lengthscale.gradient = 0

    # def gradients_X(self, dL_dK, X, X2=None):
    # """derivative of the covariance matrix with respect to X."""
    #
    #
    # raise NotImplemented("Periodic kernel: dK_dX not implemented")
    #
    # def gradients_X_diag(self, dL_dKdiag, X):
    #
    #
    # raise NotImplemented("Periodic kernel: dKdiag_dX not implemented")

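Note: the StdPeriodic hunks above rename the wavelength/wavelengths parameter to period (and lengthscales to lengthscale), matching the formula k(x, y) = theta_1 * exp(-0.5 * sum_i (sin(pi * (x_i - y_i) / T_i) / l_i)^2). A small sketch evaluating the renamed kernel against that formula, assuming the public GPy.kern.StdPeriodic constructor:

    import numpy as np
    import GPy

    period, lengthscale, variance = 2.0, 0.5, 1.3
    k = GPy.kern.StdPeriodic(1, variance=variance, period=period, lengthscale=lengthscale)

    X = np.linspace(0.0, 4.0, 7)[:, None]
    base = np.pi * (X[:, None, :] - X[None, :, :]) / period
    K_manual = variance * np.exp(-0.5 * np.sum(np.square(np.sin(base) / lengthscale), axis=-1))

    print(np.allclose(k.K(X), K_manual))  # expected True
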
@@ -81,6 +81,52 @@ class White(Static):
    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        self.variance.gradient = dL_dpsi0.sum()

+class WhiteHeteroscedastic(Static):
+    def __init__(self, input_dim, num_data, variance=1., active_dims=None, name='white_hetero'):
+        """
+        A heteroscedastic White kernel (nugget/noise).
+        It defines one variance (nugget) per input sample.
+
+        Prediction excludes any noise learnt by this Kernel, so be careful using this kernel.
+
+        You can plot the errors learnt by this kernel by something similar as:
+        plt.errorbar(m.X, m.Y, yerr=2*np.sqrt(m.kern.white.variance))
+        """
+        super(Static, self).__init__(input_dim, active_dims, name)
+        self.variance = Param('variance', np.ones(num_data) * variance, Logexp())
+        self.link_parameters(self.variance)
+
+    def Kdiag(self, X):
+        if X.shape[0] == self.variance.shape[0]:
+            # If the input has the same number of samples as
+            # the number of variances, we return the variances
+            return self.variance
+        return 0.
+
+    def K(self, X, X2=None):
+        if X2 is None and X.shape[0] == self.variance.shape[0]:
+            return np.eye(X.shape[0]) * self.variance
+        else:
+            return 0.
+
+    def psi2(self, Z, variational_posterior):
+        return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
+
+    def psi2n(self, Z, variational_posterior):
+        return np.zeros((1, Z.shape[0], Z.shape[0]), dtype=np.float64)
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        if X2 is None:
+            self.variance.gradient = np.diagonal(dL_dK)
+        else:
+            self.variance.gradient = 0.
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        self.variance.gradient = dL_dKdiag
+
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        self.variance.gradient = dL_dpsi0
+
class Bias(Static):
    def __init__(self, input_dim, variance=1., active_dims=None, name='bias'):
        super(Bias, self).__init__(input_dim, variance, active_dims, name)

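Note: the new WhiteHeteroscedastic kernel learns one noise variance per training point, and its docstring suggests visualising them as error bars. A hedged usage sketch in a GP regression; the attribute name white_hetero follows the kernel's default name and is an assumption of this example, not something stated in the diff:

    import numpy as np
    import GPy

    np.random.seed(0)
    X = np.linspace(0, 10, 40)[:, None]
    Y = np.sin(X) + np.random.randn(40, 1) * 0.2

    kern = GPy.kern.RBF(1) + GPy.kern.WhiteHeteroscedastic(1, num_data=X.shape[0])
    m = GPy.models.GPRegression(X, Y, kern)
    m.optimize()

    # one learnt nugget per data point; prediction excludes this noise (see the docstring)
    per_point_err = 2 * np.sqrt(kern.white_hetero.variance.values)
    # e.g. plt.errorbar(X[:, 0], Y[:, 0], yerr=per_point_err)
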
@@ -81,11 +81,11 @@ class Stationary(Kern):
    def dK_dr(self, r):
        raise NotImplementedError("implement derivative of the covariance function wrt r to use this class")

-    @Cache_this(limit=20, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
    def dK2_drdr(self, r):
        raise NotImplementedError("implement second derivative of covariance wrt r to use this method")

-    @Cache_this(limit=5, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
    def K(self, X, X2=None):
        """
        Kernel function applied on inputs X and X2.

@@ -99,6 +99,9 @@ class Stationary(Kern):

    @Cache_this(limit=3, ignore_args=())
    def dK_dr_via_X(self, X, X2):
        """
        compute the derivative of K wrt X going through X
        """
        #a convenience function, so we can cache dK_dr
        return self.dK_dr(self._scaled_dist(X, X2))

@@ -54,12 +54,12 @@ class TruncLinear(Kern):
        self.add_parameter(self.variances)
        self.add_parameter(self.delta)

-    @Cache_this(limit=2)
+    @Cache_this(limit=3)
    def K(self, X, X2=None):
        XX = self.variances*self._product(X, X2)
        return XX.sum(axis=-1)

-    @Cache_this(limit=2)
+    @Cache_this(limit=3)
    def _product(self, X, X2=None):
        if X2 is None:
            X2 = X

@@ -149,12 +149,12 @@ class TruncLinear_inf(Kern):
        self.add_parameter(self.variances)


-    # @Cache_this(limit=2)
+    # @Cache_this(limit=3)
    def K(self, X, X2=None):
        tmp = self._product(X, X2)
        return (self.variances*tmp).sum(axis=-1)

-    # @Cache_this(limit=2)
+    # @Cache_this(limit=3)
    def _product(self, X, X2=None):
        if X2 is None:
            X2 = X
