diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py
index eae717be..20d84949 100644
--- a/GPy/kern/__init__.py
+++ b/GPy/kern/__init__.py
@@ -2,5 +2,5 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, spline, Brownian, linear_ARD, rbf_sympy, sympykern, periodic_exponential, periodic_Matern32, periodic_Matern52
+from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, spline, Brownian, rbf_sympy, sympykern, periodic_exponential, periodic_Matern32, periodic_Matern52
 from kern import kern
diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index 1bd241e7..b483c823 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -8,7 +8,6 @@ from kern import kern
 from rbf import rbf as rbfpart
 from white import white as whitepart
 from linear import linear as linearpart
-from linear_ARD import linear_ARD as linear_ARD_part
 from exponential import exponential as exponentialpart
 from Matern32 import Matern32 as Matern32part
 from Matern52 import Matern52 as Matern52part
@@ -40,28 +39,17 @@ def rbf(D,variance=1., lengthscale=None,ARD=False):
     part = rbfpart(D,variance,lengthscale,ARD)
     return kern(D, [part])
 
-def linear(D,lengthscales=None):
+def linear(D,variances=None,ARD=True):
     """
     Construct a linear kernel.
 
     Arguments
     ---------
     D (int), obligatory
-    lengthscales (np.ndarray)
+    variances (np.ndarray)
+    ARD (boolean)
     """
-    part = linearpart(D,lengthscales)
-    return kern(D, [part])
-
-def linear_ARD(D,lengthscales=None):
-    """
-    Construct a linear ARD kernel.
-
-    Arguments
-    ---------
-    D (int), obligatory
-    lengthscales (np.ndarray)
-    """
-    part = linear_ARD_part(D,lengthscales)
+    part = linearpart(D,variances,ARD)
     return kern(D, [part])
 
 def white(D,variance=1.):
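The unified ``linear`` constructor above replaces the old ``linear``/``linear_ARD`` pair, and the keyword changes from ``lengthscales`` to ``variances``. A minimal sketch of how the new signature is called (the dimensions here are illustrative only)::

    import numpy as np
    import GPy

    # isotropic linear kernel: a single shared variance parameter
    k_iso = GPy.kern.linear(D=2, ARD=False)

    # ARD linear kernel (the new default): one variance per input dimension
    k_ard = GPy.kern.linear(D=2, variances=np.ones(2), ARD=True)

Callers of the removed ``GPy.kern.linear_ARD(...)`` switch to ``GPy.kern.linear(..., ARD=True)``.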
diff --git a/GPy/kern/linear.py b/GPy/kern/linear.py
index 14c0ece3..4aaaa60d 100644
--- a/GPy/kern/linear.py
+++ b/GPy/kern/linear.py
@@ -1,124 +1,143 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-
 from kernpart import kernpart
 import numpy as np
+
 class linear(kernpart):
     """
     Linear kernel
+
+    .. math::
+
+       k(x,y) = \sum_{i=1}^D \sigma^2_i x_i y_i
+
     :param D: the number of input dimensions
     :type D: int
-    :param variance: variance
-    :type variance: None|float
+    :param variances: the vector of variances :math:`\sigma^2_i`
+    :type variances: np.ndarray of size (1,) or (D,) depending on ARD
+    :param ARD: Automatic Relevance Determination. If set to False, the kernel is isotropic (i.e. a single variance parameter :math:`\sigma^2`); otherwise there is one variance parameter per dimension.
+    :type ARD: Boolean
+    :rtype: kernel object
     """
-    def __init__(self, D, variance=None):
+    def __init__(self,D,variances=None,ARD=True):
         self.D = D
-        if variance is None:
-            variance = 1.0
-        self.Nparam = 1
-        self.name = 'linear'
-        self._set_params(variance)
-        self._Xcache, self._X2cache = np.empty(shape=(2,))
+        self.ARD = ARD
+        if ARD == False:
+            self.Nparam = 1
+            self.name = 'linear'
+            if variances is not None:
+                assert variances.shape == (1,)
+            else:
+                variances = np.ones(1)
+            self._Xcache, self._X2cache = np.empty(shape=(2,))
+        else:
+            self.Nparam = self.D
+            self.name = 'linear_ARD'
+            if variances is not None:
+                assert variances.shape == (self.D,)
+            else:
+                variances = np.ones(self.D)
+        self._set_params(variances)
 
     def _get_params(self):
-        return self.variance
+        return self.variances
 
     def _set_params(self,x):
-        self.variance = x
+        assert x.size==(self.Nparam)
+        self.variances = x
 
     def _get_param_names(self):
-        return ['variance']
+        if self.Nparam == 1:
+            return ['variance']
+        else:
+            return ['variance_%i'%i for i in range(self.variances.size)]
 
     def K(self,X,X2,target):
-        self._K_computations(X, X2)
-        target += self.variance * self._dot_product
-
-    def Kdiag(self,X,target):
-        np.add(target,np.sum(self.variance*np.square(X),-1),target)
-
-    def dK_dtheta(self,partial,X,X2,target):
-        """
-        Computes the derivatives wrt theta
-        Return shape is NxMx(Ntheta)
-        """
-        self._K_computations(X, X2)
-        product = self._dot_product
-        # product = np.dot(X, X2.T)
-        target += np.sum(product*partial)
-
-    def dK_dX(self,partial,X,X2,target):
-        target += self.variance * np.sum(partial[:,None,:]*X2.T[None,:,:],-1)
-
-    def dKdiag_dtheta(self,partial,X,target):
-        target += np.sum(partial*np.square(X).sum(1))
+        if self.ARD:
+            XX = X*np.sqrt(self.variances)
+            XX2 = X2*np.sqrt(self.variances)
+            target += np.dot(XX, XX2.T)
+        else:
+            self._K_computations(X, X2)
+            target += self.variances * self._dot_product
 
     def _K_computations(self,X,X2):
-        # (Nicolo) changed the logic here. If X2 is None, we want to cache
-        # (X,X). In practice X2 should always be passed.
         if X2 is None:
             X2 = X
         if not (np.all(X==self._Xcache) and np.all(X2==self._X2cache)):
             self._Xcache = X
             self._X2cache = X2
-            self._dot_product = np.dot(X,X2.T)
+            self._dot_product = np.dot(X,X2.T)
         else:
             # print "Cache hit!"
             pass # TODO: insert debug message here (logging framework)
 
+    def Kdiag(self,X,target):
+        np.add(target,np.sum(self.variances*np.square(X),-1),target)
 
-    # def psi0(self,Z,mu,S,target):
-    #     expected = np.square(mu) + S
-    #     np.add(target,np.sum(self.variance*expected),target)
+    def dK_dtheta(self,partial,X,X2,target):
+        if self.ARD:
+            product = X[:,None,:]*X2[None,:,:]
+            target += (partial[:,:,None]*product).sum(0).sum(0)
+        else:
+            self._K_computations(X, X2)
+            target += np.sum(self._dot_product*partial)
 
-    # def dpsi0_dtheta(self,Z,mu,S,target):
-    #     expected = np.square(mu) + S
-    #     return -2.*np.sum(expected,0)
+    def dK_dX(self,partial,X,X2,target):
+        target += (((X2[:, None, :] * self.variances)) * partial[:,:, None]).sum(0)
 
-    # def dpsi0_dmuS(self,Z,mu,S,target_mu,target_S):
-    #     np.add(target_mu,2*mu*self.variances,target_mu)
-    #     np.add(target_S,self.variances,target_S)
+    def psi0(self,Z,mu,S,target):
+        expected = np.square(mu) + S
+        target += np.sum(self.variances*expected)
 
-    # def dpsi0_dZ(self,Z,mu,S,target):
-    #     pass
+    def dpsi0_dtheta(self,Z,mu,S,target):
+        expected = np.square(mu) + S
+        return -2.*np.sum(expected,0)
 
-    # def psi1(self,Z,mu,S,target):
-    #     """the variance, it does nothing"""
-    #     self.K(mu,Z,target)
+    def dpsi0_dmuS(self,Z,mu,S,target_mu,target_S):
+        np.add(target_mu,2*mu*self.variances,target_mu)
+        np.add(target_S,self.variances,target_S)
 
-    # def dpsi1_dtheta(self,Z,mu,S,target):
-    #     """the variance, it does nothing"""
-    #     self.dK_dtheta(mu,Z,target)
+    def dpsi0_dZ(self,Z,mu,S,target):
+        pass
 
-    # def dpsi1_dmuS(self,Z,mu,S,target_mu,target_S):
-    #     """Do nothing for S, it does not affect psi1"""
-    #     np.add(target_mu,Z/self.variances2,target_mu)
+    def psi1(self,Z,mu,S,target):
+        """the variance, it does nothing"""
+        self.K(mu,Z,target)
 
-    # def dpsi1_dZ(self,Z,mu,S,target):
-    #     self.dK_dX(mu,Z,target)
+    def dpsi1_dtheta(self,Z,mu,S,target):
+        """the variance, it does nothing"""
+        self.dK_dtheta(mu,Z,target)
 
-    # def psi2(self,Z,mu,S,target):
-    #     """Think N,M,M,Q """
-    #     mu2_S = np.square(mu)+S # N,Q
-    #     ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
-    #     psi2 = ZZ*np.square(self.variances)*mu2_S
-    #     np.add(target, psi2.sum(-1),target) # M,M
+    def dpsi1_dmuS(self,Z,mu,S,target_mu,target_S):
+        """Do nothing for S, it does not affect psi1"""
+        np.add(target_mu,Z/self.variances2,target_mu)
 
-    # def dpsi2_dtheta(self,Z,mu,S,target):
-    #     mu2_S = np.square(mu)+S # N,Q
-    #     ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
-    #     target += 2.*ZZ*mu2_S*self.variances
+    def dpsi1_dZ(self,Z,mu,S,target):
+        self.dK_dX(mu,Z,target)
 
-    # def dpsi2_dmuS(self,Z,mu,S,target_mu,target_S):
-    #     """Think N,M,M,Q """
-    #     mu2_S = np.sum(np.square(mu)+S,0) # Q,
-    #     ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
-    #     tmp = ZZ*np.square(self.variances) # M,M,Q
-    #     np.add(target_mu, tmp*2.*mu[:,None,None,:],target_mu) # N,M,M,Q
-    #     np.add(target_S, tmp, target_S) # N,M,M,Q
+    def psi2(self,Z,mu,S,target):
+        """Think N,M,M,Q """
+        mu2_S = np.square(mu)+S # N,Q
+        ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
+        psi2 = ZZ*np.square(self.variances)*mu2_S
+        np.add(target, psi2.sum(-1),target) # M,M
 
-    # def dpsi2_dZ(self,Z,mu,S,target):
-    #     mu2_S = np.sum(np.square(mu)+S,0) # Q,
-    #     target += Z[:,None,:]*np.square(self.variances)*mu2_S
+    def dpsi2_dtheta(self,Z,mu,S,target):
+        mu2_S = np.square(mu)+S # N,Q
+        ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
+        target += 2.*ZZ*mu2_S*self.variances
+
+    def dpsi2_dmuS(self,Z,mu,S,target_mu,target_S):
+        """Think N,M,M,Q """
+        mu2_S = np.sum(np.square(mu)+S,0) # Q,
+        ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
+        tmp = ZZ*np.square(self.variances) # M,M,Q
+        np.add(target_mu, tmp*2.*mu[:,None,None,:],target_mu) # N,M,M,Q
+        np.add(target_S, tmp, target_S) # N,M,M,Q
+
+    def dpsi2_dZ(self,Z,mu,S,target):
+        mu2_S = np.sum(np.square(mu)+S,0) # Q,
+        target += Z[:,None,:]*np.square(self.variances)*mu2_S
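For context on the psi statistics activated in this file: ``psi0`` is the expectation of ``Kdiag`` under a Gaussian density with mean ``mu`` and variance ``S``, i.e. :math:`\psi_0 = \sum_i \sigma^2_i (\mu_i^2 + S_i)`. A quick Monte Carlo sanity check of that closed form (a sketch, not part of the patch; the numbers are made up)::

    import numpy as np

    variances = np.array([0.5, 2.0])
    mu = np.array([1.0, -0.3])
    S = np.array([0.2, 0.4])

    # closed form used by psi0: sum_i variances_i * (mu_i^2 + S_i)
    psi0 = np.sum(variances * (np.square(mu) + S))

    # Monte Carlo estimate of E[k(x,x)] = E[sum_i variances_i * x_i^2]
    x = mu + np.sqrt(S) * np.random.randn(200000, 2)
    psi0_mc = np.mean(np.sum(variances * np.square(x), axis=1))
    assert abs(psi0 - psi0_mc) < 0.05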
diff --git a/GPy/kern/linear_ARD.py b/GPy/kern/linear_ARD.py
deleted file mode 100644
index b9112044..00000000
--- a/GPy/kern/linear_ARD.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-from kernpart import kernpart
-import numpy as np
-
-class linear_ARD(kernpart):
-    """
-    Linear ARD kernel
-
-    :param D: the number of input dimensions
-    :type D: int
-    :param variances: ARD variances
-    :type variances: None|np.ndarray
-    """
-
-    def __init__(self,D,variances=None):
-        self.D = D
-        if variances is not None:
-            assert variances.shape==(self.D,)
-        else:
-            variances = np.ones(self.D)
-        self.Nparam = int(self.D)
-        self.name = 'linear'
-        self._set_params(variances)
-
-    def _get_params(self):
-        return self.variances
-
-    def _set_params(self,x):
-        assert x.size==(self.Nparam)
-        self.variances = x
-
-    def _get_param_names(self):
-        if self.D==1:
-            return ['variance']
-        else:
-            return ['variance_%i'%i for i in range(self.variances.size)]
-
-    def K(self,X,X2,target):
-        XX = X*np.sqrt(self.variances)
-        XX2 = X2*np.sqrt(self.variances)
-        target += np.dot(XX, XX2.T)
-
-    def Kdiag(self,X,target):
-        np.add(target,np.sum(self.variances*np.square(X),-1),target)
-
-    def dK_dtheta(self,partial,X,X2,target):
-        product = X[:,None,:]*X2[None,:,:]
-        target += (partial[:,:,None]*product).sum(0).sum(0)
-
-    def dK_dX(self,partial,X,X2,target):
-        target += (((X2[:, None, :] * self.variances)) * partial[:,:, None]).sum(0)
-
-    def psi0(self,Z,mu,S,target):
-        expected = np.square(mu) + S
-        np.add(target,np.sum(self.variances*expected),target)
-
-    def dpsi0_dtheta(self,Z,mu,S,target):
-        expected = np.square(mu) + S
-        return -2.*np.sum(expected,0)
-
-    def dpsi0_dmuS(self,Z,mu,S,target_mu,target_S):
-        np.add(target_mu,2*mu*self.variances,target_mu)
-        np.add(target_S,self.variances,target_S)
-
-    def dpsi0_dZ(self,Z,mu,S,target):
-        pass
-
-    def psi1(self,Z,mu,S,target):
-        """the variance, it does nothing"""
-        self.K(mu,Z,target)
-
-    def dpsi1_dtheta(self,Z,mu,S,target):
-        """the variance, it does nothing"""
-        self.dK_dtheta(mu,Z,target)
-
-    def dpsi1_dmuS(self,Z,mu,S,target_mu,target_S):
-        """Do nothing for S, it does not affect psi1"""
-        np.add(target_mu,Z/self.variances2,target_mu)
-
-    def dpsi1_dZ(self,Z,mu,S,target):
-        self.dK_dX(mu,Z,target)
-
-    def psi2(self,Z,mu,S,target):
-        """Think N,M,M,Q """
-        mu2_S = np.square(mu)+S # N,Q
-        ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
-        psi2 = ZZ*np.square(self.variances)*mu2_S
-        np.add(target, psi2.sum(-1),target) # M,M
-
-    def dpsi2_dtheta(self,Z,mu,S,target):
-        mu2_S = np.square(mu)+S # N,Q
-        ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
-        target += 2.*ZZ*mu2_S*self.variances
-
-    def dpsi2_dmuS(self,Z,mu,S,target_mu,target_S):
-        """Think N,M,M,Q """
-        mu2_S = np.sum(np.square(mu)+S,0) # Q,
-        ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
-        tmp = ZZ*np.square(self.variances) # M,M,Q
-        np.add(target_mu, tmp*2.*mu[:,None,None,:],target_mu) # N,M,M,Q
-        np.add(target_S, tmp, target_S) # N,M,M,Q
-
-    def dpsi2_dZ(self,Z,mu,S,target):
-        mu2_S = np.sum(np.square(mu)+S,0) # Q,
-        target += Z[:,None,:]*np.square(self.variances)*mu2_S
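Since ``linear_ARD`` is deleted, a reviewer may want to check that the merged class reproduces its covariance. A hypothetical spot check against the variance-weighted dot-product form of the ARD linear kernel (the import assumes the GPy/kern directory, as in the modules above)::

    import numpy as np
    from linear import linear

    X = np.random.randn(5, 3)
    v = np.random.rand(3)

    k = linear(3, variances=v, ARD=True)
    K = np.zeros((5, 5))
    k.K(X, X, K)  # target is accumulated in place

    # k(x,y) = sum_i v_i * x_i * y_i, i.e. a variance-weighted dot product
    assert np.allclose(K, np.dot(X * v, X.T))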
diff --git a/doc/tuto_GP_regression.rst b/doc/tuto_GP_regression.rst
new file mode 100644
index 00000000..e1c795ed
--- /dev/null
+++ b/doc/tuto_GP_regression.rst
@@ -0,0 +1,105 @@
+
+*************************************
+Gaussian process regression tutorial
+*************************************
+
+In this tutorial we will see the basics of building a 1-dimensional and a 2-dimensional Gaussian process model, also known as a kriging model.
+
+We first import the libraries we will need::
+
+    import pylab as pb
+    pb.ion()
+    import numpy as np
+    import GPy
+
+1-dimensional model
+===================
+
+For this toy example, we assume we have the following inputs and outputs::
+
+    X = np.random.uniform(-3.,3.,(20,1))
+    Y = np.sin(X) + np.random.randn(20,1)*0.05
+
+Note that the observations Y include some noise.
+
+The first step is to define the covariance kernel we want to use for the model. Here we choose a Gaussian kernel (i.e. rbf or squared exponential) plus some white noise::
+
+    Gaussian = GPy.kern.rbf(D=1)
+    noise = GPy.kern.white(D=1)
+    kernel = Gaussian + noise
+
+The parameter D stands for the dimension of the input space. Note that many other kernels are implemented, such as:
+
+* linear (``GPy.kern.linear``)
+* exponential kernel (``GPy.kern.exponential``)
+* Matern 3/2 (``GPy.kern.Matern32``)
+* Matern 5/2 (``GPy.kern.Matern52``)
+* spline (``GPy.kern.spline``)
+* and many others...
+
+The inputs required for building the model are the observations and the kernel::
+
+    m = GPy.models.GP_regression(X,Y,kernel)
+
+The functions ``print`` and ``plot`` can help us understand the model we have just built::
+
+    print m
+    m.plot()
+
+The default values of the kernel parameters may not be relevant for the current data (for example, the confidence intervals seem too wide on the previous figure). A common approach is to find the values of the parameters that maximize the likelihood of the data. There are two steps for doing that with GPy:
+
+* Constrain the parameters of the kernel to ensure the kernel will always be a valid covariance structure (for example, we don't want some variances to be negative!).
+* Run the optimization.
+
+There are various ways to constrain the parameters of the kernel. The most basic is to constrain all the parameters to be positive::
+
+    m.constrain_positive('')
+
+but it is also possible to bound a parameter within a range or to fix it. The argument of ``m.constrain_positive`` is a regular expression that matches the names of the parameters to be constrained (as seen in ``print m``). For example, if we want the variance to be positive, the lengthscale to be in [1,10] and the noise variance to be fixed, we can write::
+
+    #m.unconstrain('') # Required if the model has been previously constrained
+    m.constrain_positive('rbf_variance')
+    m.constrain_bounded('lengthscale',1.,10.)
+    m.constrain_fixed('white',0.0025)
+
+Once the constraints have been imposed, the model can be optimized::
+
+    m.optimize()
+
+If we want to perform some restarts to try to improve the result of the optimization, we can use the ``optimize_restarts`` function::
+
+    m.optimize_restarts(Nrestarts = 10)
+    m.plot()
+    print(m)
+
+2-dimensional example
+=====================
+
+Here is a 2-dimensional example::
+
+    import pylab as pb
+    pb.ion()
+    import numpy as np
+    import GPy
+
+    # sample inputs and outputs
+    X = np.random.uniform(-3.,3.,(50,2))
+    Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(50,1)*0.05
+
+    # define kernel
+    ker = GPy.kern.Matern52(2,ARD=True) + GPy.kern.white(2)
+
+    # create simple GP model
+    m = GPy.models.GP_regression(X,Y,ker)
+
+    # constrain all parameters to be positive
+    m.constrain_positive('')
+
+    # optimize and plot
+    pb.figure()
+    m.optimize('tnc', max_f_eval = 1000)
+
+    m.plot()
+    print(m)
+
+The flag ``ARD=True`` in the definition of the Matern kernel specifies that we want one lengthscale parameter per dimension (i.e. the GP is not isotropic).
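With the constructor introduced by this patch, the tutorial's 2-dimensional example could equally be run with the refactored linear kernel; a sketch using the same synthetic data and only the calls shown above::

    import numpy as np
    import GPy

    X = np.random.uniform(-3.,3.,(50,2))
    Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2]) + np.random.randn(50,1)*0.05

    # ARD=True (the new default) gives one variance parameter per input dimension
    ker = GPy.kern.linear(2, ARD=True) + GPy.kern.white(2)

    m = GPy.models.GP_regression(X, Y, ker)
    m.constrain_positive('')
    m.optimize()
    print(m)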