diff --git a/.gitignore b/.gitignore index 60866848..7ca1dd25 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,6 @@ dist build eggs -parts bin var sdist @@ -39,3 +38,11 @@ nosetests.xml #bfgs optimiser leaves this lying around iterate.dat + +# Nosetests # +############# +*.noseids + +# git merge files # +################### +*.orig diff --git a/.travis.yml b/.travis.yml index 6d188401..1f796285 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,15 +7,20 @@ virtualenv: system_site_packages: true # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors -before_install: +before_install: - sudo apt-get install -qq python-scipy python-pip - sudo apt-get install -qq python-matplotlib + # Workaround for a permissions issue with Travis virtual machine images + # that breaks Python's multiprocessing: + # https://github.com/travis-ci/travis-cookbooks/issues/155 + - sudo rm -rf /dev/shm + - sudo ln -s /run/shm /dev/shm install: - - pip install --upgrade numpy==1.7.1 - - pip install sphinx + - pip install --upgrade numpy==1.7.1 + - pip install sphinx - pip install nose - pip install . --use-mirrors # command to run tests, e.g. python setup.py test -script: +script: - nosetests GPy/testing diff --git a/AUTHORS.txt b/AUTHORS.txt new file mode 100644 index 00000000..f81db5ec --- /dev/null +++ b/AUTHORS.txt @@ -0,0 +1,7 @@ +James Hensman +Nicolo Fusi +Ricardo Andrade +Nicolas Durrande +Alan Saul +Max Zwiessele +Neil D. Lawrence diff --git a/GPy/__init__.py b/GPy/__init__.py new file mode 100644 index 00000000..f35fda78 --- /dev/null +++ b/GPy/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
+# Licensed under the BSD 3-clause license (see LICENSE.txt) +import warnings +warnings.filterwarnings("ignore", category=DeprecationWarning) + +import core +import models +import mappings +import inference +import util +import examples +import likelihoods +import testing +from numpy.testing import Tester +from nose.tools import nottest +import kern +from core import priors + +@nottest +def tests(): + Tester(testing).test(verbose=10) diff --git a/GPy/core/__init__.py b/GPy/core/__init__.py new file mode 100644 index 00000000..9813d5ae --- /dev/null +++ b/GPy/core/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +from model import * +from parameterized import * +import priors +from gp import GP +from sparse_gp import SparseGP +from fitc import FITC +from svigp import SVIGP +from mapping import * diff --git a/GPy/core/domains.py b/GPy/core/domains.py new file mode 100644 index 00000000..cefac6c2 --- /dev/null +++ b/GPy/core/domains.py @@ -0,0 +1,26 @@ +''' +Created on 4 Jun 2013 + +@author: maxz + +(Hyper-)Parameter domains defined for :py:mod:`~GPy.core.priors` and :py:mod:`~GPy.kern`. +These domains specify the legitimate realm of the parameters to live in. 
+ +:const:`~GPy.core.domains.REAL` : + real domain, all values in the real numbers are allowed + +:const:`~GPy.core.domains.POSITIVE`: + positive domain, only positive real values are allowed + +:const:`~GPy.core.domains.NEGATIVE`: + same as :const:`~GPy.core.domains.POSITIVE`, but only negative values are allowed + +:const:`~GPy.core.domains.BOUNDED`: + only values within the bounded range are allowed, + the bounds are specified withing the object with the bounded range +''' + +REAL = 'real' +POSITIVE = "positive" +NEGATIVE = 'negative' +BOUNDED = 'bounded' diff --git a/GPy/core/fitc.py b/GPy/core/fitc.py new file mode 100644 index 00000000..ef171459 --- /dev/null +++ b/GPy/core/fitc.py @@ -0,0 +1,247 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +import pylab as pb +from ..util.linalg import mdot, jitchol, chol_inv, tdot, symmetrify, pdinv, dtrtrs +from ..util.plot import gpplot +from .. import kern +from scipy import stats +from sparse_gp import SparseGP + +class FITC(SparseGP): + """ + sparse FITC approximation + + :param X: inputs + :type X: np.ndarray (num_data x Q) + :param likelihood: a likelihood instance, containing the observed data + :type likelihood: GPy.likelihood.(Gaussian | EP) + :param kernel : the kernel (covariance function). 
See link kernels + :type kernel: a GPy.kern.kern instance + :param Z: inducing inputs (optional, see note) + :type Z: np.ndarray (M x Q) | None + :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) + :type normalize_(X|Y): bool + """ + + def __init__(self, X, likelihood, kernel, Z, normalize_X=False): + SparseGP.__init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False) + assert self.output_dim == 1, "FITC model is not defined for handling multiple outputs" + + def update_likelihood_approximation(self): + """ + Approximates a non-Gaussian likelihood using Expectation Propagation + + For a Gaussian likelihood, no iteration is required: + this function does nothing + """ + self.likelihood.restart() + self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0) + self._set_params(self._get_params()) + + def _compute_kernel_matrices(self): + # kernel computations, using BGPLVM notation + self.Kmm = self.kern.K(self.Z) + self.psi0 = self.kern.Kdiag(self.X) + self.psi1 = self.kern.K(self.Z, self.X) + self.psi2 = None + + def _computations(self): + #factor Kmm + self.Lm = jitchol(self.Kmm) + self.Lmi,info = dtrtrs(self.Lm,np.eye(self.num_inducing),lower=1) + Lmipsi1 = np.dot(self.Lmi,self.psi1) + self.Qnn = np.dot(Lmipsi1.T,Lmipsi1).copy() + self.Diag0 = self.psi0 - np.diag(self.Qnn) + self.beta_star = self.likelihood.precision/(1. 
+ self.likelihood.precision*self.Diag0[:,None]) #NOTE: beta_star contains Diag0 and the precision + self.V_star = self.beta_star * self.likelihood.Y + + # The rather complex computations of self.A + tmp = self.psi1 * (np.sqrt(self.beta_star.flatten().reshape(1, self.num_data))) + tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1) + self.A = tdot(tmp) + + # factor B + self.B = np.eye(self.num_inducing) + self.A + self.LB = jitchol(self.B) + self.LBi = chol_inv(self.LB) + self.psi1V = np.dot(self.psi1, self.V_star) + + Lmi_psi1V, info = dtrtrs(self.Lm, np.asfortranarray(self.psi1V), lower=1, trans=0) + self._LBi_Lmi_psi1V, _ = dtrtrs(self.LB, np.asfortranarray(Lmi_psi1V), lower=1, trans=0) + + Kmmipsi1 = np.dot(self.Lmi.T,Lmipsi1) + b_psi1_Ki = self.beta_star * Kmmipsi1.T + Ki_pbp_Ki = np.dot(Kmmipsi1,b_psi1_Ki) + Kmmi = np.dot(self.Lmi.T,self.Lmi) + LBiLmi = np.dot(self.LBi,self.Lmi) + LBL_inv = np.dot(LBiLmi.T,LBiLmi) + VVT = np.outer(self.V_star,self.V_star) + VV_p_Ki = np.dot(VVT,Kmmipsi1.T) + Ki_pVVp_Ki = np.dot(Kmmipsi1,VV_p_Ki) + psi1beta = self.psi1*self.beta_star.T + H = self.Kmm + mdot(self.psi1,psi1beta.T) + LH = jitchol(H) + LHi = chol_inv(LH) + Hi = np.dot(LHi.T,LHi) + + betapsi1TLmiLBi = np.dot(psi1beta.T,LBiLmi.T) + alpha = np.array([np.dot(a.T,a) for a in betapsi1TLmiLBi])[:,None] + gamma_1 = mdot(VVT,self.psi1.T,Hi) + pHip = mdot(self.psi1.T,Hi,self.psi1) + gamma_2 = mdot(self.beta_star*pHip,self.V_star) + gamma_3 = self.V_star * gamma_2 + + self._dL_dpsi0 = -0.5 * self.beta_star#dA_dpsi0: logdet(self.beta_star) + self._dL_dpsi0 += .5 * self.V_star**2 #dA_psi0: yT*beta_star*y + self._dL_dpsi0 += .5 *alpha #dC_dpsi0 + self._dL_dpsi0 += 0.5*mdot(self.beta_star*pHip,self.V_star)**2 - self.V_star * mdot(self.V_star.T,pHip*self.beta_star).T #dD_dpsi0 + + self._dL_dpsi1 = b_psi1_Ki.copy() #dA_dpsi1: logdet(self.beta_star) + self._dL_dpsi1 += -np.dot(psi1beta.T,LBL_inv) #dC_dpsi1 + self._dL_dpsi1 += gamma_1 - mdot(psi1beta.T,Hi,self.psi1,gamma_1) 
#dD_dpsi1 + + self._dL_dKmm = -0.5 * np.dot(Kmmipsi1,b_psi1_Ki) #dA_dKmm: logdet(self.beta_star) + self._dL_dKmm += .5*(LBL_inv - Kmmi) + mdot(LBL_inv,psi1beta,Kmmipsi1.T) #dC_dKmm + self._dL_dKmm += -.5 * mdot(Hi,self.psi1,gamma_1) #dD_dKmm + + self._dpsi1_dtheta = 0 + self._dpsi1_dX = 0 + self._dKmm_dtheta = 0 + self._dKmm_dX = 0 + + self._dpsi1_dX_jkj = 0 + self._dpsi1_dtheta_jkj = 0 + + for i,V_n,alpha_n,gamma_n,gamma_k in zip(range(self.num_data),self.V_star,alpha,gamma_2,gamma_3): + K_pp_K = np.dot(Kmmipsi1[:,i:(i+1)],Kmmipsi1[:,i:(i+1)].T) + _dpsi1 = (-V_n**2 - alpha_n + 2.*gamma_k - gamma_n**2) * Kmmipsi1.T[i:(i+1),:] + _dKmm = .5*(V_n**2 + alpha_n + gamma_n**2 - 2.*gamma_k) * K_pp_K #Diag_dD_dKmm + self._dpsi1_dtheta += self.kern.dK_dtheta(_dpsi1,self.X[i:i+1,:],self.Z) + self._dKmm_dtheta += self.kern.dK_dtheta(_dKmm,self.Z) + self._dKmm_dX += 2.*self.kern.dK_dX(_dKmm ,self.Z) + self._dpsi1_dX += self.kern.dK_dX(_dpsi1.T,self.Z,self.X[i:i+1,:]) + + # the partial derivative vector for the likelihood + if self.likelihood.Nparams == 0: + # save computation here. + self.partial_for_likelihood = None + elif self.likelihood.is_heteroscedastic: + raise NotImplementedError, "heteroscedatic derivates not implemented." 
+ else: + # likelihood is not heterscedatic + dbstar_dnoise = self.likelihood.precision * (self.beta_star**2 * self.Diag0[:,None] - self.beta_star) + Lmi_psi1 = mdot(self.Lmi,self.psi1) + LBiLmipsi1 = np.dot(self.LBi,Lmi_psi1) + aux_0 = np.dot(self._LBi_Lmi_psi1V.T,LBiLmipsi1) + aux_1 = self.likelihood.Y.T * np.dot(self._LBi_Lmi_psi1V.T,LBiLmipsi1) + aux_2 = np.dot(LBiLmipsi1.T,self._LBi_Lmi_psi1V) + + dA_dnoise = 0.5 * self.input_dim * (dbstar_dnoise/self.beta_star).sum() - 0.5 * self.input_dim * np.sum(self.likelihood.Y**2 * dbstar_dnoise) + dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T,self.LBi,Lmi_psi1) * Lmi_psi1 * dbstar_dnoise.T) + dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T,self.LBi,Lmi_psi1) * Lmi_psi1 * dbstar_dnoise.T) + + dD_dnoise_1 = mdot(self.V_star*LBiLmipsi1.T,LBiLmipsi1*dbstar_dnoise.T*self.likelihood.Y.T) + alpha = mdot(LBiLmipsi1,self.V_star) + alpha_ = mdot(LBiLmipsi1.T,alpha) + dD_dnoise_2 = -0.5 * self.input_dim * np.sum(alpha_**2 * dbstar_dnoise ) + + dD_dnoise_1 = mdot(self.V_star.T,self.psi1.T,self.Lmi.T,self.LBi.T,self.LBi,self.Lmi,self.psi1,dbstar_dnoise*self.likelihood.Y) + dD_dnoise_2 = 0.5*mdot(self.V_star.T,self.psi1.T,Hi,self.psi1,dbstar_dnoise*self.psi1.T,Hi,self.psi1,self.V_star) + dD_dnoise = dD_dnoise_1 + dD_dnoise_2 + + self.partial_for_likelihood = dA_dnoise + dC_dnoise + dD_dnoise + + def log_likelihood(self): + """ Compute the (lower bound on the) log marginal likelihood """ + A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.beta_star)) - 0.5 * np.sum(self.V_star * self.likelihood.Y) + C = -self.output_dim * (np.sum(np.log(np.diag(self.LB)))) + D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V)) + return A + C + D + + def _log_likelihood_gradients(self): + pass + return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood))) + + def dL_dtheta(self): + dL_dtheta = self.kern.dKdiag_dtheta(self._dL_dpsi0,self.X) + dL_dtheta += 
self.kern.dK_dtheta(self._dL_dpsi1,self.X,self.Z) + dL_dtheta += self.kern.dK_dtheta(self._dL_dKmm,X=self.Z) + dL_dtheta += self._dKmm_dtheta + dL_dtheta += self._dpsi1_dtheta + return dL_dtheta + + def dL_dZ(self): + dL_dZ = self.kern.dK_dX(self._dL_dpsi1.T,self.Z,self.X) + dL_dZ += 2. * self.kern.dK_dX(self._dL_dKmm,X=self.Z) + dL_dZ += self._dpsi1_dX + dL_dZ += self._dKmm_dX + return dL_dZ + + def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False): + assert X_variance_new is None, "FITC model is not defined for handling uncertain inputs." + + if self.likelihood.is_heteroscedastic: + Iplus_Dprod_i = 1./(1.+ self.Diag0 * self.likelihood.precision.flatten()) + self.Diag = self.Diag0 * Iplus_Dprod_i + self.P = Iplus_Dprod_i[:,None] * self.psi1.T + self.RPT0 = np.dot(self.Lmi,self.psi1) + self.L = np.linalg.cholesky(np.eye(self.num_inducing) + np.dot(self.RPT0,((1. - Iplus_Dprod_i)/self.Diag0)[:,None]*self.RPT0.T)) + self.R,info = dtrtrs(self.L,self.Lmi,lower=1) + self.RPT = np.dot(self.R,self.P.T) + self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T,self.RPT) + self.w = self.Diag * self.likelihood.v_tilde + self.Gamma = np.dot(self.R.T, np.dot(self.RPT,self.likelihood.v_tilde)) + self.mu = self.w + np.dot(self.P,self.Gamma) + + """ + Make a prediction for the generalized FITC model + + Arguments + --------- + X : Input prediction data - Nx1 numpy array (floats) + """ + # q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T) + + # Ci = I + (RPT0)Di(RPT0).T + # C = I - [RPT0] * (input_dim+[RPT0].T*[RPT0])^-1*[RPT0].T + # = I - [RPT0] * (input_dim + self.Qnn)^-1 * [RPT0].T + # = I - [RPT0] * (U*U.T)^-1 * [RPT0].T + # = I - V.T * V + U = np.linalg.cholesky(np.diag(self.Diag0) + self.Qnn) + V,info = dtrtrs(U,self.RPT0.T,lower=1) + C = np.eye(self.num_inducing) - np.dot(V.T,V) + mu_u = np.dot(C,self.RPT0)*(1./self.Diag0[None,:]) + #self.C = C + #self.RPT0 = np.dot(self.R0,self.Knm.T) P0.T + #self.mu_u = mu_u + #self.U = U + # q(u|y) = N(u| 
R0i*mu_H,R0i*Sigma_H*R0i.T) + mu_H = np.dot(mu_u,self.mu) + self.mu_H = mu_H + Sigma_H = C + np.dot(mu_u,np.dot(self.Sigma,mu_u.T)) + # q(f_star|y) = N(f_star|mu_star,sigma2_star) + Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts) + KR0T = np.dot(Kx.T,self.Lmi.T) + mu_star = np.dot(KR0T,mu_H) + if full_cov: + Kxx = self.kern.K(Xnew,which_parts=which_parts) + var = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.num_inducing),KR0T.T)) + else: + Kxx = self.kern.Kdiag(Xnew,which_parts=which_parts) + var = (Kxx + np.sum(KR0T.T*np.dot(Sigma_H - np.eye(self.num_inducing),KR0T.T),0))[:,None] + return mu_star[:,None],var + else: + raise NotImplementedError, "Heteroscedastic case not implemented." + """ + Kx = self.kern.K(self.Z, Xnew) + mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V) + if full_cov: + Kxx = self.kern.K(Xnew) + var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) #NOTE this won't work for plotting + else: + Kxx = self.kern.Kdiag(Xnew) + var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0) + return mu,var[:,None] + """ diff --git a/GPy/core/gp.py b/GPy/core/gp.py new file mode 100644 index 00000000..278ddc74 --- /dev/null +++ b/GPy/core/gp.py @@ -0,0 +1,157 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +import pylab as pb +from .. 
import kern +from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs +from ..likelihoods import EP +from gp_base import GPBase + +class GP(GPBase): + """ + Gaussian Process model for regression and EP + + :param X: input observations + :param kernel: a GPy kernel, defaults to rbf+white + :parm likelihood: a GPy likelihood + :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales) + :type normalize_X: False|True + :rtype: model object + :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1 + :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] + :type powerep: list + + .. Note:: Multiple independent outputs are allowed using columns of Y + + """ + def __init__(self, X, likelihood, kernel, normalize_X=False): + GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X) + self._set_params(self._get_params()) + + def getstate(self): + return GPBase.getstate(self) + + def setstate(self, state): + GPBase.setstate(self, state) + self._set_params(self._get_params()) + + def _set_params(self, p): + self.kern._set_params_transformed(p[:self.kern.num_params_transformed()]) + self.likelihood._set_params(p[self.kern.num_params_transformed():]) + + self.K = self.kern.K(self.X) + self.K += self.likelihood.covariance_matrix + + self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K) + + # the gradient of the likelihood wrt the covariance matrix + if self.likelihood.YYT is None: + # alpha = np.dot(self.Ki, self.likelihood.Y) + alpha, _ = dpotrs(self.L, self.likelihood.Y, lower=1) + + self.dL_dK = 0.5 * (tdot(alpha) - self.output_dim * self.Ki) + else: + # tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki) + tmp, _ = dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1) + tmp, _ = dpotrs(self.L, np.asfortranarray(tmp.T), lower=1) + self.dL_dK = 0.5 * (tmp - self.output_dim * self.Ki) + + def _get_params(self): + return 
np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params())) + + + def _get_param_names(self): + return self.kern._get_param_names_transformed() + self.likelihood._get_param_names() + + def update_likelihood_approximation(self): + """ + Approximates a non-gaussian likelihood using Expectation Propagation + + For a Gaussian likelihood, no iteration is required: + this function does nothing + """ + self.likelihood.restart() + self.likelihood.fit_full(self.kern.K(self.X)) + self._set_params(self._get_params()) # update the GP + + def _model_fit_term(self): + """ + Computes the model fit using YYT if it's available + """ + if self.likelihood.YYT is None: + tmp, _ = dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1) + return -0.5 * np.sum(np.square(tmp)) + # return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y))) + else: + return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT)) + + def log_likelihood(self): + """ + The log marginal likelihood of the GP. + + For an EP model, can be written as the log likelihood of a regression + model for a new variable Y* = v_tilde/tau_tilde, with a covariance + matrix K* = K + diag(1./tau_tilde) plus a normalization term. + """ + return (-0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) - + 0.5 * self.output_dim * self.K_logdet + self._model_fit_term() + self.likelihood.Z) + + + def _log_likelihood_gradients(self): + """ + The gradient of all parameters. 
+ + Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta + """ + return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK)))) + + def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False): + """ + Internal helper function for making predictions, does not account + for normalization or likelihood + """ + Kx = self.kern.K(_Xnew, self.X, which_parts=which_parts).T + # KiKx = np.dot(self.Ki, Kx) + KiKx, _ = dpotrs(self.L, np.asfortranarray(Kx), lower=1) + mu = np.dot(KiKx.T, self.likelihood.Y) + if full_cov: + Kxx = self.kern.K(_Xnew, which_parts=which_parts) + var = Kxx - np.dot(KiKx.T, Kx) + else: + Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts) + var = Kxx - np.sum(np.multiply(KiKx, Kx), 0) + var = var[:, None] + if stop: + debug_this # @UndefinedVariable + return mu, var + + def predict(self, Xnew, which_parts='all', full_cov=False, likelihood_args=dict()): + """ + Predict the function(s) at the new point(s) Xnew. + Arguments + --------- + :param Xnew: The points at which to make a prediction + :type Xnew: np.ndarray, Nnew x self.input_dim + :param which_parts: specifies which outputs kernel(s) to use in prediction + :type which_parts: ('all', list of bools) + :param full_cov: whether to return the folll covariance matrix, or just the diagonal + :type full_cov: bool + :rtype: posterior mean, a Numpy array, Nnew x self.input_dim + :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise + :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim + + + If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew. + This is to allow for different normalizations of the output dimensions. 
+ + """ + # normalize X values + Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale + mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts) + + # now push through likelihood + mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, **likelihood_args) + + return mean, var, _025pm, _975pm diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py new file mode 100644 index 00000000..6935fc6f --- /dev/null +++ b/GPy/core/gp_base.py @@ -0,0 +1,209 @@ +import numpy as np +from .. import kern +from ..util.plot import gpplot, Tango, x_frame1D, x_frame2D +import pylab as pb +from GPy.core.model import Model + +class GPBase(Model): + """ + Gaussian process base model for holding shared behaviour between + sparse_GP and GP models. + """ + + def __init__(self, X, likelihood, kernel, normalize_X=False): + self.X = X + assert len(self.X.shape) == 2 + self.num_data, self.input_dim = self.X.shape + assert isinstance(kernel, kern.kern) + self.kern = kernel + self.likelihood = likelihood + assert self.X.shape[0] == self.likelihood.data.shape[0] + self.num_data, self.output_dim = self.likelihood.data.shape + + if normalize_X: + self._Xoffset = X.mean(0)[None, :] + self._Xscale = X.std(0)[None, :] + self.X = (X.copy() - self._Xoffset) / self._Xscale + else: + self._Xoffset = np.zeros((1, self.input_dim)) + self._Xscale = np.ones((1, self.input_dim)) + + super(GPBase, self).__init__() + # Model.__init__(self) + # All leaf nodes should call self._set_params(self._get_params()) at + # the end + + def getstate(self): + """ + Get the current state of the class, here we return everything that is needed to recompute the model. 
+ """ + return Model.getstate(self) + [self.X, + self.num_data, + self.input_dim, + self.kern, + self.likelihood, + self.output_dim, + self._Xoffset, + self._Xscale] + + def setstate(self, state): + self._Xscale = state.pop() + self._Xoffset = state.pop() + self.output_dim = state.pop() + self.likelihood = state.pop() + self.kern = state.pop() + self.input_dim = state.pop() + self.num_data = state.pop() + self.X = state.pop() + Model.setstate(self, state) + + def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None): + """ + Plot the GP's view of the world, where the data is normalized and the + likelihood is Gaussian. + + Plot the posterior of the GP. + - In one dimension, the function is plotted with a shaded region identifying two standard deviations. + - In two dimsensions, a contour-plot shows the mean predicted function + - In higher dimensions, we've no implemented this yet !TODO! + + Can plot only part of the data and part of the posterior functions + using which_data and which_functions + + :param samples: the number of a posteriori samples to plot + :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits + :param which_data: which if the training data to plot (default all) + :type which_data: 'all' or a slice object to slice self.X, self.Y + :param which_parts: which of the kernel functions to plot (additively) + :type which_parts: 'all', or list of bools + :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D + :type resolution: int + :param full_cov: + :type full_cov: bool + :param fignum: figure to plot on. + :type fignum: figure number + :param ax: axes to plot on. 
+ :type ax: axes handle + + """ + if which_data == 'all': + which_data = slice(None) + + if ax is None: + fig = pb.figure(num=fignum) + ax = fig.add_subplot(111) + + if self.X.shape[1] == 1: + Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits) + if samples == 0: + m, v = self._raw_predict(Xnew, which_parts=which_parts) + gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v), axes=ax) + ax.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5) + else: + m, v = self._raw_predict(Xnew, which_parts=which_parts, full_cov=True) + Ysim = np.random.multivariate_normal(m.flatten(), v, samples) + gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None, ], axes=ax) + for i in range(samples): + ax.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25) + ax.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5) + ax.set_xlim(xmin, xmax) + ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None]))) + ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin) + ax.set_ylim(ymin, ymax) + + elif self.X.shape[1] == 2: + resolution = resolution or 50 + Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution) + m, v = self._raw_predict(Xnew, which_parts=which_parts) + m = m.reshape(resolution, resolution).T + ax.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable + ax.scatter(self.X[:, 0], self.X[:, 1], 40, self.likelihood.Y, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max()) # @UndefinedVariable + ax.set_xlim(xmin[0], xmax[0]) + ax.set_ylim(xmin[1], xmax[1]) + else: + raise NotImplementedError, "Cannot define a frame with more than two input dimensions" + + def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], 
linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']): + """ + Plot the GP with noise where the likelihood is Gaussian. + + Plot the posterior of the GP. + - In one dimension, the function is plotted with a shaded region identifying two standard deviations. + - In two dimsensions, a contour-plot shows the mean predicted function + - In higher dimensions, we've no implemented this yet !TODO! + + Can plot only part of the data and part of the posterior functions + using which_data and which_functions + + :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits + :type plot_limits: np.array + :param which_data: which if the training data to plot (default all) + :type which_data: 'all' or a slice object to slice self.X, self.Y + :param which_parts: which of the kernel functions to plot (additively) + :type which_parts: 'all', or list of bools + :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D + :type resolution: int + :param levels: number of levels to plot in a contour plot. + :type levels: int + :param samples: the number of a posteriori samples to plot + :type samples: int + :param fignum: figure to plot on. + :type fignum: figure number + :param ax: axes to plot on. + :type ax: axes handle + :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v. + :type fixed_inputs: a list of tuples + :param linecol: color of line to plot. 
+ :type linecol: + :param fillcol: color of fill + :type fillcol: + :param levels: for 2D plotting, the number of contour levels to use + is ax is None, create a new figure + + """ + # TODO include samples + if which_data == 'all': + which_data = slice(None) + + if ax is None: + fig = pb.figure(num=fignum) + ax = fig.add_subplot(111) + + plotdims = self.input_dim - len(fixed_inputs) + + if plotdims == 1: + + Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now + + fixed_dims = np.array([i for i,v in fixed_inputs]) + freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims) + + Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits) + Xgrid = np.empty((Xnew.shape[0],self.input_dim)) + Xgrid[:,freedim] = Xnew + for i,v in fixed_inputs: + Xgrid[:,i] = v + + m, _, lower, upper = self.predict(Xgrid, which_parts=which_parts) + for d in range(m.shape[1]): + gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol) + ax.plot(Xu[which_data,freedim], self.likelihood.data[which_data, d], 'kx', mew=1.5) + ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper)) + ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin) + ax.set_xlim(xmin, xmax) + ax.set_ylim(ymin, ymax) + + elif self.X.shape[1] == 2: # FIXME + resolution = resolution or 50 + Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution) + x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution) + m, _, lower, upper = self.predict(Xnew, which_parts=which_parts) + m = m.reshape(resolution, resolution).T + ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable + Yf = self.likelihood.data.flatten() + ax.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) 
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
#
# NOTE(review): reconstructed from a whitespace-mangled diff chunk of
# GPy/core/mapping.py; indentation has been re-inferred from statement
# structure. Fixes applied: `plot` previously referenced undefined names
# (`y.shape[1]`, `m.reshape`) and an invalid matplotlib kwarg (`edgecol`).

from ..util.plot import Tango, x_frame1D, x_frame2D
from parameterized import Parameterized
import numpy as np
import pylab as pb


class Mapping(Parameterized):
    """
    Base model for shared behavior between models that can act like a mapping.
    """

    def __init__(self, input_dim, output_dim):
        self.input_dim = input_dim
        self.output_dim = output_dim
        super(Mapping, self).__init__()
        # All leaf nodes should call self._set_params(self._get_params()) at
        # the end

    def f(self, X):
        """Evaluate the mapping at the input locations X. Must be overridden."""
        raise NotImplementedError

    def df_dX(self, dL_df, X):
        """Evaluate derivatives of mapping outputs with respect to inputs.

        :param dL_df: gradient of the objective with respect to the function.
        :type dL_df: ndarray (num_data x output_dim)
        :param X: the input locations where derivatives are to be evaluated.
        :type X: ndarray (num_data x input_dim)
        :returns: matrix containing gradients of the function with respect to the inputs.
        """
        raise NotImplementedError

    def df_dtheta(self, dL_df, X):
        """The gradient of the outputs of the mapping with respect to each of the parameters.

        :param dL_df: gradient of the objective with respect to the function.
        :type dL_df: ndarray (num_data x output_dim)
        :param X: input locations where the function is evaluated.
        :type X: ndarray (num_data x input_dim)
        :returns: matrix containing gradients with respect to parameters of each output for each input data.
        :rtype: ndarray (num_params length)
        """
        raise NotImplementedError

    def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue']):
        """
        Plot the mapping.

        - In one dimension, the function is plotted.
        - In two dimensions, a contour-plot shows the function.
        - In higher dimensions, we've not implemented this yet !TODO!

        Can plot only part of the data and part of the posterior functions
        using which_data and which_parts.

        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
        :type plot_limits: np.array
        :param which_data: which of the training data to plot (default all)
        :type which_data: 'all' or a slice object to slice self.X, self.Y
        :param which_parts: which of the kernel functions to plot (additively)
        :type which_parts: 'all', or list of bools
        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
        :type resolution: int
        :param levels: number of levels to plot in a contour plot.
        :type levels: int
        :param samples: the number of a posteriori samples to plot
        :type samples: int
        :param fignum: figure to plot on.
        :type fignum: figure number
        :param ax: axes to plot on (if ax is None, create a new figure).
        :type ax: axes handle
        :param fixed_inputs: a list of tuples [(i,v), (i,v)...], specifying that input index i should be set to value v.
        :type fixed_inputs: a list of tuples
        :param linecol: color of line to plot.
        """
        # TODO include samples
        if which_data == 'all':
            which_data = slice(None)

        if ax is None:
            fig = pb.figure(num=fignum)
            ax = fig.add_subplot(111)

        plotdims = self.input_dim - len(fixed_inputs)

        if plotdims == 1:
            # NOTE self.X are the normalized values now
            Xu = self.X * self._Xscale + self._Xoffset

            fixed_dims = np.array([i for i, v in fixed_inputs])
            freedim = np.setdiff1d(np.arange(self.input_dim), fixed_dims)

            Xnew, xmin, xmax = x_frame1D(Xu[:, freedim], plot_limits=plot_limits)
            Xgrid = np.empty((Xnew.shape[0], self.input_dim))
            Xgrid[:, freedim] = Xnew
            for i, v in fixed_inputs:
                Xgrid[:, i] = v

            f = self.predict(Xgrid, which_parts=which_parts)
            # FIX: was `range(y.shape[1])` with `y` undefined; the predicted
            # values are in `f`. Also `edgecol` is not a Line2D property —
            # `color` is the intended kwarg.
            for d in range(f.shape[1]):
                ax.plot(Xnew, f[:, d], color=linecol)

        elif self.X.shape[1] == 2:
            resolution = resolution or 50
            Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
            f = self.predict(Xnew, which_parts=which_parts)
            # FIX: was `m = m.reshape(...)` with `m` undefined; reshape the
            # prediction `f` onto the grid and contour that.
            m = f.reshape(resolution, resolution).T
            ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)  # @UndefinedVariable
            ax.set_xlim(xmin[0], xmax[0])
            ax.set_ylim(xmin[1], xmax[1])

        else:
            raise NotImplementedError("Cannot define a frame with more than two input dimensions")

from GPy.core.model import Model

class Mapping_check_model(Model):
    """This is a dummy model class used as a base class for checking that the
    gradients of a given mapping are implemented correctly. It enables
    checkgrad() to be called independently on each mapping."""
    def __init__(self, mapping=None, dL_df=None, X=None):
        num_samples = 20
        if mapping is None:
            # NOTE(review): `GPy` is not imported in this module, so this
            # default path raises NameError if exercised — confirm the
            # intended import before relying on it.
            mapping = GPy.mapping.linear(1, 1)
        if X is None:
            X = np.random.randn(num_samples, mapping.input_dim)
        if dL_df is None:
            dL_df = np.ones((num_samples, mapping.output_dim))

        self.mapping = mapping
        self.X = X
        self.dL_df = dL_df
        self.num_params = self.mapping.num_params
        Model.__init__(self)

    def _get_params(self):
        return self.mapping._get_params()

    def _get_param_names(self):
        return self.mapping._get_param_names()

    def _set_params(self, x):
        self.mapping._set_params(x)

    def log_likelihood(self):
        # Dummy objective: a linear functional of the mapping outputs, so its
        # gradient w.r.t. parameters/inputs is exactly df_dtheta / df_dX.
        return (self.dL_df * self.mapping.f(self.X)).sum()

    def _log_likelihood_gradients(self):
        raise NotImplementedError("This needs to be implemented to use the Mapping_check_model class.")

class Mapping_check_df_dtheta(Mapping_check_model):
    """This class allows gradient checks for the gradient of a mapping with respect to parameters."""
    def __init__(self, mapping=None, dL_df=None, X=None):
        Mapping_check_model.__init__(self, mapping=mapping, dL_df=dL_df, X=X)

    def _log_likelihood_gradients(self):
        return self.mapping.df_dtheta(self.dL_df, self.X)

class Mapping_check_df_dX(Mapping_check_model):
    """This class allows gradient checks for the gradient of a mapping with respect to X."""
    def __init__(self, mapping=None, dL_df=None, X=None):
        Mapping_check_model.__init__(self, mapping=mapping, dL_df=dL_df, X=X)

        # NOTE(review): this local rebinding of dL_df happens after the base
        # __init__ has already stored self.dL_df, so it has no effect —
        # looks like a latent bug in the original; behavior preserved here.
        if dL_df is None:
            dL_df = np.ones((self.X.shape[0], self.mapping.output_dim))
        self.num_params = self.X.shape[0] * self.mapping.input_dim

    def _log_likelihood_gradients(self):
        return self.mapping.df_dX(self.dL_df, self.X).flatten()

    def _get_param_names(self):
        return ['X_' + str(i) + ',' + str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]

    def _get_params(self):
        return self.X.flatten()

    def _set_params(self, x):
        self.X = x.reshape(self.X.shape)
""" + def __init__(self, mapping=None, dL_df=None, X=None): + Mapping_check_model.__init__(self,mapping=mapping,dL_df=dL_df, X=X) + + if dL_df==None: + dL_df = np.ones((self.X.shape[0],self.mapping.output_dim)) + self.num_params = self.X.shape[0]*self.mapping.input_dim + + def _log_likelihood_gradients(self): + return self.mapping.df_dX(self.dL_df, self.X).flatten() + + def _get_param_names(self): + return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])] + + def _get_params(self): + return self.X.flatten() + + def _set_params(self, x): + self.X=x.reshape(self.X.shape) + diff --git a/GPy/core/model.py b/GPy/core/model.py new file mode 100644 index 00000000..c31ea209 --- /dev/null +++ b/GPy/core/model.py @@ -0,0 +1,581 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +from .. import likelihoods +from ..inference import optimization +from ..util.linalg import jitchol +from GPy.util.misc import opt_wrapper +from parameterized import Parameterized +import multiprocessing as mp +import numpy as np +from GPy.core.domains import POSITIVE, REAL +from numpy.linalg.linalg import LinAlgError +# import numdifftools as ndt + +class Model(Parameterized): + _fail_count = 0 # Count of failed optimization steps (see objective) + _allowed_failures = 10 # number of allowed failures + def __init__(self): + Parameterized.__init__(self) + self.priors = None + self.optimization_runs = [] + self.sampling_runs = [] + self.preferred_optimizer = 'scg' + # self._set_params(self._get_params()) has been taken out as it should only be called on leaf nodes + def log_likelihood(self): + raise NotImplementedError, "this needs to be implemented to use the model class" + def _log_likelihood_gradients(self): + raise NotImplementedError, "this needs to be implemented to use the model class" + + def getstate(self): + """ + Get the current state of the class. 
+ + Inherited from Parameterized, so add those parameters to the state + :return: list of states from the model. + + """ + return Parameterized.getstate(self) + \ + [self.priors, self.optimization_runs, + self.sampling_runs, self.preferred_optimizer] + + def setstate(self, state): + """ + set state from previous call to getstate + call Parameterized with the rest of the state + + :param state: the state of the model. + :type state: list as returned from getstate. + """ + self.preferred_optimizer = state.pop() + self.sampling_runs = state.pop() + self.optimization_runs = state.pop() + self.priors = state.pop() + Parameterized.setstate(self, state) + + def set_prior(self, regexp, what): + """ + Sets priors on the model parameters. + + Notes + ----- + Asserts that the prior is suitable for the constraint. If the + wrong constraint is in place, an error is raised. If no + constraint is in place, one is added (warning printed). + + For tied parameters, the prior will only be "counted" once, thus + a prior object is only inserted on the first tied index + + :param regexp: regular expression of parameters on which priors need to be set. + :type param: string, regexp, or integer array + :param what: prior to set on parameter. 
+ :type what: GPy.core.Prior type + + """ + if self.priors is None: + self.priors = [None for i in range(self._get_params().size)] + + which = self.grep_param_names(regexp) + + # check tied situation + tie_partial_matches = [tie for tie in self.tied_indices if (not set(tie).isdisjoint(set(which))) & (not set(tie) == set(which))] + if len(tie_partial_matches): + raise ValueError, "cannot place prior across partial ties" + tie_matches = [tie for tie in self.tied_indices if set(which) == set(tie) ] + if len(tie_matches) > 1: + raise ValueError, "cannot place prior across multiple ties" + elif len(tie_matches) == 1: + which = which[:1] # just place a prior object on the first parameter + + + # check constraints are okay + + if what.domain is POSITIVE: + constrained_positive_indices = [i for i, t in zip(self.constrained_indices, self.constraints) if t.domain is POSITIVE] + if len(constrained_positive_indices): + constrained_positive_indices = np.hstack(constrained_positive_indices) + else: + constrained_positive_indices = np.zeros(shape=(0,)) + bad_constraints = np.setdiff1d(self.all_constrained_indices(), constrained_positive_indices) + assert not np.any(which[:, None] == bad_constraints), "constraint and prior incompatible" + unconst = np.setdiff1d(which, constrained_positive_indices) + if len(unconst): + print "Warning: constraining parameters to be positive:" + print '\n'.join([n for i, n in enumerate(self._get_param_names()) if i in unconst]) + print '\n' + self.constrain_positive(unconst) + elif what.domain is REAL: + assert not np.any(which[:, None] == self.all_constrained_indices()), "constraint and prior incompatible" + else: + raise ValueError, "prior not recognised" + + # store the prior in a local list + for w in which: + self.priors[w] = what + + def get_gradient(self, name, return_names=False): + """ + Get model gradient(s) by name. The name is applied as a regular expression and all parameters that match that regular expression are returned. 
+ + :param name: the name of parameters required (as a regular expression). + :type name: regular expression + :param return_names: whether or not to return the names matched (default False) + :type return_names: bool + """ + matches = self.grep_param_names(name) + if len(matches): + if return_names: + return self._log_likelihood_gradients()[matches], np.asarray(self._get_param_names())[matches].tolist() + else: + return self._log_likelihood_gradients()[matches] + else: + raise AttributeError, "no parameter matches %s" % name + + def log_prior(self): + """evaluate the prior""" + if self.priors is not None: + return np.sum([p.lnpdf(x) for p, x in zip(self.priors, self._get_params()) if p is not None]) + else: + return 0. + + def _log_prior_gradients(self): + """evaluate the gradients of the priors""" + if self.priors is None: + return 0. + x = self._get_params() + ret = np.zeros(x.size) + [np.put(ret, i, p.lnpdf_grad(xx)) for i, (p, xx) in enumerate(zip(self.priors, x)) if not p is None] + return ret + + def _transform_gradients(self, g): + x = self._get_params() + for index, constraint in zip(self.constrained_indices, self.constraints): + g[index] = g[index] * constraint.gradfactor(x[index]) + [np.put(g, i, v) for i, v in [(t[0], np.sum(g[t])) for t in self.tied_indices]] + if len(self.tied_indices) or len(self.fixed_indices): + to_remove = np.hstack((self.fixed_indices + [t[1:] for t in self.tied_indices])) + return np.delete(g, to_remove) + else: + return g + + def randomize(self): + """ + Randomize the model. 
+ Make this draw from the prior if one exists, else draw from N(0,1) + """ + # first take care of all parameters (from N(0,1)) + x = self._get_params_transformed() + x = np.random.randn(x.size) + self._set_params_transformed(x) + # now draw from prior where possible + x = self._get_params() + if self.priors is not None: + [np.put(x, i, p.rvs(1)) for i, p in enumerate(self.priors) if not p is None] + self._set_params(x) + self._set_params_transformed(self._get_params_transformed()) # makes sure all of the tied parameters get the same init (since there's only one prior object...) + + + def optimize_restarts(self, num_restarts=10, robust=False, verbose=True, parallel=False, num_processes=None, **kwargs): + """ + Perform random restarts of the model, and set the model to the best + seen solution. + + If the robust flag is set, exceptions raised during optimizations will + be handled silently. If _all_ runs fail, the model is reset to the + existing parameter values. + + Notes + ----- + :param num_restarts: number of restarts to use (default 10) + :type num_restarts: int + :param robust: whether to handle exceptions silently or not (default False) + :type robust: bool + :param parallel: whether to run each restart as a separate process. It relies on the multiprocessing module. + :type parallel: bool + :param num_processes: number of workers in the multiprocessing pool + :type numprocesses: int + **kwargs are passed to the optimizer. They can be: + :param max_f_eval: maximum number of function evaluations + :type max_f_eval: int + :param max_iters: maximum number of iterations + :type max_iters: int + :param messages: whether to display during optimisation + :type messages: bool + + ..Note: If num_processes is None, the number of workes in the multiprocessing pool is automatically + set to the number of processors on the current machine. 
+ + + """ + initial_parameters = self._get_params_transformed() + + if parallel: + try: + jobs = [] + pool = mp.Pool(processes=num_processes) + for i in range(num_restarts): + self.randomize() + job = pool.apply_async(opt_wrapper, args=(self,), kwds=kwargs) + jobs.append(job) + + pool.close() # signal that no more data coming in + pool.join() # wait for all the tasks to complete + except KeyboardInterrupt: + print "Ctrl+c received, terminating and joining pool." + pool.terminate() + pool.join() + + for i in range(num_restarts): + try: + if not parallel: + self.randomize() + self.optimize(**kwargs) + else: + self.optimization_runs.append(jobs[i].get()) + + if verbose: + print("Optimization restart {0}/{1}, f = {2}".format(i + 1, num_restarts, self.optimization_runs[-1].f_opt)) + except Exception as e: + if robust: + print("Warning - optimization restart {0}/{1} failed".format(i + 1, num_restarts)) + else: + raise e + + if len(self.optimization_runs): + i = np.argmin([o.f_opt for o in self.optimization_runs]) + self._set_params_transformed(self.optimization_runs[i].x_opt) + else: + self._set_params_transformed(initial_parameters) + + def ensure_default_constraints(self): + """ + Ensure that any variables which should clearly be positive + have been constrained somehow. The method performs a regular + expression search on parameter names looking for the terms + 'variance', 'lengthscale', 'precision' and 'kappa'. If any of + these terms are present in the name the parameter is + constrained positive. 
+ """ + positive_strings = ['variance', 'lengthscale', 'precision', 'kappa'] + # param_names = self._get_param_names() + currently_constrained = self.all_constrained_indices() + to_make_positive = [] + for s in positive_strings: + for i in self.grep_param_names(".*" + s): + if not (i in currently_constrained): + to_make_positive.append(i) + if len(to_make_positive): + self.constrain_positive(np.asarray(to_make_positive)) + + def objective_function(self, x): + """ + The objective function passed to the optimizer. It combines + the likelihood and the priors. + + Failures are handled robustly. The algorithm will try several times to + return the objective, and will raise the original exception if it + the objective cannot be computed. + + :param x: the parameters of the model. + :parameter type: np.array + """ + try: + self._set_params_transformed(x) + self._fail_count = 0 + except (LinAlgError, ZeroDivisionError, ValueError) as e: + if self._fail_count >= self._allowed_failures: + raise e + self._fail_count += 1 + return np.inf + return -self.log_likelihood() - self.log_prior() + + def objective_function_gradients(self, x): + """ + Gets the gradients from the likelihood and the priors. + + Failures are handled robustly. The algorithm will try several times to + return the gradients, and will raise the original exception if it + the objective cannot be computed. + + :param x: the parameters of the model. 
+ :parameter type: np.array + """ + try: + self._set_params_transformed(x) + obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()) + self._fail_count = 0 + except (LinAlgError, ZeroDivisionError, ValueError) as e: + if self._fail_count >= self._allowed_failures: + raise e + self._fail_count += 1 + obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100) + return obj_grads + + def objective_and_gradients(self, x): + """ + Compute the objective function of the model and the gradient of the model at the point given by x. + + :param x: the point at which gradients are to be computed. + :type np.array: + """ + + try: + self._set_params_transformed(x) + obj_f = -self.log_likelihood() - self.log_prior() + self._fail_count = 0 + obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()) + except (LinAlgError, ZeroDivisionError, ValueError) as e: + if self._fail_count >= self._allowed_failures: + raise e + self._fail_count += 1 + obj_f = np.inf + obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100) + return obj_f, obj_grads + + def optimize(self, optimizer=None, start=None, **kwargs): + """ + Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors. + kwargs are passed to the optimizer. They can be: + + :param max_f_eval: maximum number of function evaluations + :type max_f_eval: int + :messages: whether to display during optimisation + :type messages: bool + :param optimzer: which optimizer to use (defaults to self.preferred optimizer) + :type optimzer: string TODO: valid strings? 
+ """ + if optimizer is None: + optimizer = self.preferred_optimizer + + if start == None: + start = self._get_params_transformed() + + optimizer = optimization.get_optimizer(optimizer) + opt = optimizer(start, model=self, **kwargs) + + opt.run(f_fp=self.objective_and_gradients, f=self.objective_function, fp=self.objective_function_gradients) + + self.optimization_runs.append(opt) + + self._set_params_transformed(opt.x_opt) + + def optimize_SGD(self, momentum=0.1, learning_rate=0.01, iterations=20, **kwargs): + # assert self.Y.shape[1] > 1, "SGD only works with D > 1" + sgd = SGD.StochasticGD(self, iterations, learning_rate, momentum, **kwargs) # @UndefinedVariable + sgd.run() + self.optimization_runs.append(sgd) + + def Laplace_covariance(self): + """return the covariance matrix of a Laplace approximation at the current (stationary) point.""" + # TODO add in the prior contributions for MAP estimation + # TODO fix the hessian for tied, constrained and fixed components + if hasattr(self, 'log_likelihood_hessian'): + A = -self.log_likelihood_hessian() + + else: + print "numerically calculating Hessian. please be patient!" + x = self._get_params() + def f(x): + self._set_params(x) + return self.log_likelihood() + h = ndt.Hessian(f) # @UndefinedVariable + A = -h(x) + self._set_params(x) + # check for almost zero components on the diagonal which screw up the cholesky + aa = np.nonzero((np.diag(A) < 1e-6) & (np.diag(A) > 0.))[0] + A[aa, aa] = 0. + return A + + def Laplace_evidence(self): + """Returns an estiamte of the model evidence based on the Laplace approximation. 
+ Uses a numerical estimate of the Hessian if none is available analytically.""" + A = self.Laplace_covariance() + try: + hld = np.sum(np.log(np.diag(jitchol(A)[0]))) + except: + return np.nan + return 0.5 * self._get_params().size * np.log(2 * np.pi) + self.log_likelihood() - hld + + def __str__(self, names=None): + if names is None: + names = self._get_print_names() + s = Parameterized.__str__(self, names=names).split('\n') + # add priors to the string + if self.priors is not None: + strs = [str(p) if p is not None else '' for p in self.priors] + else: + strs = [''] * len(self._get_param_names()) + name_indices = self.grep_param_names("|".join(names)) + strs = np.array(strs)[name_indices] + width = np.array(max([len(p) for p in strs] + [5])) + 4 + + log_like = self.log_likelihood() + log_prior = self.log_prior() + obj_funct = '\nLog-likelihood: {0:.3e}'.format(log_like) + if len(''.join(strs)) != 0: + obj_funct += ', Log prior: {0:.3e}, LL+prior = {0:.3e}'.format(log_prior, log_like + log_prior) + obj_funct += '\n\n' + s[0] = obj_funct + s[0] + s[0] += "|{h:^{col}}".format(h='prior', col=width) + s[1] += '-' * (width + 1) + + for p in range(2, len(strs) + 2): + s[p] += '|{prior:^{width}}'.format(prior=strs[p - 2], width=width) + + return '\n'.join(s) + + + def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3): + """ + Check the gradient of the ,odel by comparing to a numerical + estimate. If the verbose flag is passed, invividual + components are tested (and printed) + + :param verbose: If True, print a "full" checking of each parameter + :type verbose: bool + :param step: The size of the step around which to linearise the objective + :type step: float (default 1e-6) + :param tolerance: the tolerance allowed (see note) + :type tolerance: float (default 1e-3) + + Note:- + The gradient is considered correct if the ratio of the analytical + and numerical gradients is within of unity. 
+ """ + + x = self._get_params_transformed().copy() + + if not verbose: + # just check the global ratio + dx = step * np.sign(np.random.uniform(-1, 1, x.size)) + + # evaulate around the point x + f1, g1 = self.objective_and_gradients(x + dx) + f2, g2 = self.objective_and_gradients(x - dx) + gradient = self.objective_function_gradients(x) + + numerical_gradient = (f1 - f2) / (2 * dx) + global_ratio = (f1 - f2) / (2 * np.dot(dx, gradient)) + + return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() - 1) < tolerance + else: + # check the gradient of each parameter individually, and do some pretty printing + try: + names = self._get_param_names_transformed() + except NotImplementedError: + names = ['Variable %i' % i for i in range(len(x))] + + # Prepare for pretty-printing + header = ['Name', 'Ratio', 'Difference', 'Analytical', 'Numerical'] + max_names = max([len(names[i]) for i in range(len(names))] + [len(header[0])]) + float_len = 10 + cols = [max_names] + cols.extend([max(float_len, len(header[i])) for i in range(1, len(header))]) + cols = np.array(cols) + 5 + header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))] + header_string = map(lambda x: '|'.join(x), [header_string]) + separator = '-' * len(header_string[0]) + print '\n'.join([header_string[0], separator]) + + if target_param is None: + param_list = range(len(x)) + else: + param_list = self.grep_param_names(target_param, transformed=True, search=True) + if not np.any(param_list): + print "No free parameters to check" + return + + + for i in param_list: + xx = x.copy() + xx[i] += step + f1, g1 = self.objective_and_gradients(xx) + xx[i] -= 2.*step + f2, g2 = self.objective_and_gradients(xx) + gradient = self.objective_function_gradients(x)[i] + + numerical_gradient = (f1 - f2) / (2 * step) + ratio = (f1 - f2) / (2 * step * gradient) + difference = np.abs((f1 - f2) / 2 / step - gradient) + + if (np.abs(1. 
- ratio) < tolerance) or np.abs(difference) < tolerance: + formatted_name = "\033[92m {0} \033[0m".format(names[i]) + else: + formatted_name = "\033[91m {0} \033[0m".format(names[i]) + r = '%.6f' % float(ratio) + d = '%.6f' % float(difference) + g = '%.6f' % gradient + ng = '%.6f' % float(numerical_gradient) + grad_string = "{0:^{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}".format(formatted_name, r, d, g, ng, c0=cols[0] + 9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4]) + print grad_string + + def input_sensitivity(self): + """ + return an array describing the sesitivity of the model to each input + + NB. Right now, we're basing this on the lengthscales (or + variances) of the kernel. TODO: proper sensitivity analysis + where we integrate across the model inputs and evaluate the + effect on the variance of the model output. """ + + if not hasattr(self, 'kern'): + raise ValueError, "this model has no kernel" + + k = [p for p in self.kern.parts if p.name in ['rbf', 'linear', 'rbf_inv']] + if (not len(k) == 1) or (not k[0].ARD): + raise ValueError, "cannot determine sensitivity for this kernel" + k = k[0] + + if k.name == 'rbf': + return 1. / k.lengthscale + elif k.name == 'rbf_inv': + return k.inv_lengthscale + elif k.name == 'linear': + return k.variances + + + def pseudo_EM(self, epsilon=.1, **kwargs): + """ + TODO: Should this not bein the GP class? + EM - like algorithm for Expectation Propagation and Laplace approximation + + kwargs are passed to the optimize function. They can be: + + :epsilon: convergence criterion + :max_f_eval: maximum number of function evaluations + :messages: whether to display during optimisation + :param optimzer: whice optimizer to use (defaults to self.preferred optimizer) + :type optimzer: string TODO: valid strings? + + """ + assert isinstance(self.likelihood, likelihoods.EP), "pseudo_EM is only available for EP likelihoods" + ll_change = epsilon + 1. 
+ iteration = 0 + last_ll = -np.inf + + convergence = False + alpha = 0 + stop = False + + while not stop: + last_approximation = self.likelihood.copy() + last_params = self._get_params() + self.update_likelihood_approximation() + new_ll = self.log_likelihood() + ll_change = new_ll - last_ll + + if ll_change < 0: + self.likelihood = last_approximation # restore previous likelihood approximation + self._set_params(last_params) # restore model parameters + print "Log-likelihood decrement: %s \nLast likelihood update discarded." % ll_change + stop = True + else: + self.optimize(**kwargs) + last_ll = self.log_likelihood() + if ll_change < epsilon: + stop = True + iteration += 1 + if stop: + print "%s iterations." % iteration + self.update_likelihood_approximation() diff --git a/GPy/core/parameterized.py b/GPy/core/parameterized.py new file mode 100644 index 00000000..cad4d2a9 --- /dev/null +++ b/GPy/core/parameterized.py @@ -0,0 +1,385 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +import re +import copy +import cPickle +import warnings +import transformations + +class Parameterized(object): + def __init__(self): + """ + This is the base class for model and kernel. 
Mostly just handles tieing and constraining of parameters + """ + self.tied_indices = [] + self.fixed_indices = [] + self.fixed_values = [] + self.constrained_indices = [] + self.constraints = [] + + def _get_params(self): + raise NotImplementedError, "this needs to be implemented to use the Parameterized class" + def _set_params(self, x): + raise NotImplementedError, "this needs to be implemented to use the Parameterized class" + + def _get_param_names(self): + raise NotImplementedError, "this needs to be implemented to use the Parameterized class" + def _get_print_names(self): + """ Override for which names to print out, when using print m """ + return self._get_param_names() + + def pickle(self, filename, protocol=None): + if protocol is None: + if self._has_get_set_state(): + protocol = 0 + else: + protocol = -1 + with open(filename, 'w') as f: + cPickle.dump(self, f, protocol) + + def copy(self): + """Returns a (deep) copy of the current model """ + return copy.deepcopy(self) + + def __getstate__(self): + if self._has_get_set_state(): + return self.getstate() + return self.__dict__ + + def __setstate__(self, state): + if self._has_get_set_state(): + self.setstate(state) # set state + self._set_params(self._get_params()) # restore all values + return + self.__dict__ = state + + def _has_get_set_state(self): + return 'getstate' in vars(self.__class__) and 'setstate' in vars(self.__class__) + + def getstate(self): + """ + Get the current state of the class, + here just all the indices, rest can get recomputed + + For inheriting from Parameterized: + Allways append the state of the inherited object + and call down to the inherited object in setstate!! 
+ """ + return [self.tied_indices, + self.fixed_indices, + self.fixed_values, + self.constrained_indices, + self.constraints] + + def setstate(self, state): + self.constraints = state.pop() + self.constrained_indices = state.pop() + self.fixed_values = state.pop() + self.fixed_indices = state.pop() + self.tied_indices = state.pop() + + def __getitem__(self, regexp, return_names=False): + """ + Get a model parameter by name. The name is applied as a regular + expression and all parameters that match that regular expression are + returned. + """ + matches = self.grep_param_names(regexp) + if len(matches): + if return_names: + return self._get_params()[matches], np.asarray(self._get_param_names())[matches].tolist() + else: + return self._get_params()[matches] + else: + raise AttributeError, "no parameter matches %s" % regexp + + def __setitem__(self, name, val): + """ + Set model parameter(s) by name. The name is provided as a regular + expression. All parameters matching that regular expression are set to + the given value. + """ + matches = self.grep_param_names(name) + if len(matches): + val = np.array(val) + assert (val.size == 1) or val.size == len(matches), "Shape mismatch: {}:({},)".format(val.size, len(matches)) + x = self._get_params() + x[matches] = val + self._set_params(x) + else: + raise AttributeError, "no parameter matches %s" % name + + def tie_params(self, regexp): + """ + Tie (all!) parameters matching the regular expression `regexp`. + """ + matches = self.grep_param_names(regexp) + assert matches.size > 0, "need at least something to tie together" + if len(self.tied_indices): + assert not np.any(matches[:, None] == np.hstack(self.tied_indices)), "Some indices are already tied!" + self.tied_indices.append(matches) + # TODO only one of the priors will be evaluated. 
Give a warning message if the priors are not identical + if hasattr(self, 'prior'): + pass + + self._set_params_transformed(self._get_params_transformed()) # sets tied parameters to single value + + def untie_everything(self): + """Unties all parameters by setting tied_indices to an empty list.""" + self.tied_indices = [] + + def grep_param_names(self, regexp, transformed=False, search=False): + """ + :param regexp: regular expression to select parameter names + :type regexp: re | str | int + :rtype: the indices of self._get_param_names which match the regular expression. + + Note:- + Other objects are passed through - i.e. integers which weren't meant for grepping + """ + + if transformed: + names = self._get_param_names_transformed() + else: + names = self._get_param_names() + + if type(regexp) in [str, np.string_, np.str]: + regexp = re.compile(regexp) + elif type(regexp) is re._pattern_type: + pass + else: + return regexp + if search: + return np.nonzero([regexp.search(name) for name in names])[0] + else: + return np.nonzero([regexp.match(name) for name in names])[0] + + def num_params_transformed(self): + removed = 0 + for tie in self.tied_indices: + removed += tie.size - 1 + + for fix in self.fixed_indices: + removed += fix.size + + return len(self._get_params()) - removed + + def unconstrain(self, regexp): + """Unconstrain matching parameters. 
Does not untie parameters""" + matches = self.grep_param_names(regexp) + + # tranformed contraints: + for match in matches: + self.constrained_indices = [i[i <> match] for i in self.constrained_indices] + + # remove empty constraints + tmp = zip(*[(i, t) for i, t in zip(self.constrained_indices, self.constraints) if len(i)]) + if tmp: + self.constrained_indices, self.constraints = zip(*[(i, t) for i, t in zip(self.constrained_indices, self.constraints) if len(i)]) + self.constrained_indices, self.constraints = list(self.constrained_indices), list(self.constraints) + + # fixed: + self.fixed_values = [np.delete(values, np.nonzero(np.sum(indices[:, None] == matches[None, :], 1))[0]) for indices, values in zip(self.fixed_indices, self.fixed_values)] + self.fixed_indices = [np.delete(indices, np.nonzero(np.sum(indices[:, None] == matches[None, :], 1))[0]) for indices in self.fixed_indices] + + # remove empty elements + tmp = [(i, v) for i, v in zip(self.fixed_indices, self.fixed_values) if len(i)] + if tmp: + self.fixed_indices, self.fixed_values = zip(*tmp) + self.fixed_indices, self.fixed_values = list(self.fixed_indices), list(self.fixed_values) + else: + self.fixed_indices, self.fixed_values = [], [] + + def constrain_negative(self, regexp): + """ Set negative constraints. """ + self.constrain(regexp, transformations.negative_logexp()) + + def constrain_positive(self, regexp): + """ Set positive constraints. """ + self.constrain(regexp, transformations.logexp()) + + def constrain_bounded(self, regexp, lower, upper): + """ Set bounded constraints. 
""" + self.constrain(regexp, transformations.logistic(lower, upper)) + + def all_constrained_indices(self): + if len(self.constrained_indices) or len(self.fixed_indices): + return np.hstack(self.constrained_indices + self.fixed_indices) + else: + return np.empty(shape=(0,)) + + def constrain(self, regexp, transform): + assert isinstance(transform, transformations.transformation) + + matches = self.grep_param_names(regexp) + overlap = set(matches).intersection(set(self.all_constrained_indices())) + if overlap: + self.unconstrain(np.asarray(list(overlap))) + print 'Warning: re-constraining these parameters' + pn = self._get_param_names() + for i in overlap: + print pn[i] + + self.constrained_indices.append(matches) + self.constraints.append(transform) + x = self._get_params() + x[matches] = transform.initialize(x[matches]) + self._set_params(x) + + def constrain_fixed(self, regexp, value=None): + """ + Arguments + --------- + :param regexp: which parameters need to be fixed. + :type regexp: ndarray(dtype=int) or regular expression object or string + :param value: the vlaue to fix the parameters to. If the value is not specified, + the parameter is fixed to the current value + :type value: float + Notes + ----- + Fixing a parameter which is tied to another, or constrained in some way will result in an error. 
+ To fix multiple parameters to the same value, simply pass a regular expression which matches both parameter names, or pass both of the indexes + """ + matches = self.grep_param_names(regexp) + overlap = set(matches).intersection(set(self.all_constrained_indices())) + if overlap: + self.unconstrain(np.asarray(list(overlap))) + print 'Warning: re-constraining these parameters' + pn = self._get_param_names() + for i in overlap: + print pn[i] + + self.fixed_indices.append(matches) + if value != None: + self.fixed_values.append(value) + else: + self.fixed_values.append(self._get_params()[self.fixed_indices[-1]]) + + # self.fixed_values.append(value) + self._set_params_transformed(self._get_params_transformed()) + + def _get_params_transformed(self): + """use self._get_params to get the 'true' parameters of the model, which are then tied, constrained and fixed""" + x = self._get_params() + [np.put(x, i, t.finv(x[i])) for i, t in zip(self.constrained_indices, self.constraints)] + + to_remove = self.fixed_indices + [t[1:] for t in self.tied_indices] + if len(to_remove): + return np.delete(x, np.hstack(to_remove)) + else: + return x + + def _set_params_transformed(self, x): + """ takes the vector x, which is then modified (by untying, reparameterising or inserting fixed values), and then call self._set_params""" + self._set_params(self._untransform_params(x)) + + def _untransform_params(self, x): + """ + The transformation required for _set_params_transformed. + + This moves the vector x seen by the optimiser (unconstrained) to the + valid parameter vector seen by the model + + Note: + - This function is separate from _set_params_transformed for downstream flexibility + """ + # work out how many places are fixed, and where they are. tricky logic! 
+ fix_places = self.fixed_indices + [t[1:] for t in self.tied_indices] + if len(fix_places): + fix_places = np.hstack(fix_places) + Nfix_places = fix_places.size + else: + Nfix_places = 0 + + free_places = np.setdiff1d(np.arange(Nfix_places + x.size, dtype=np.int), fix_places) + + # put the models values in the vector xx + xx = np.zeros(Nfix_places + free_places.size, dtype=np.float64) + + xx[free_places] = x + [np.put(xx, i, v) for i, v in zip(self.fixed_indices, self.fixed_values)] + [np.put(xx, i, v) for i, v in [(t[1:], xx[t[0]]) for t in self.tied_indices] ] + + [np.put(xx, i, t.f(xx[i])) for i, t in zip(self.constrained_indices, self.constraints)] + if hasattr(self, 'debug'): + stop # @UndefinedVariable + + return xx + + def _get_param_names_transformed(self): + """ + Returns the parameter names as propagated after constraining, + tying or fixing, i.e. a list of the same length as _get_params_transformed() + """ + n = self._get_param_names() + + # remove/concatenate the tied parameter names + if len(self.tied_indices): + for t in self.tied_indices: + n[t[0]] = "".join([n[tt] for tt in t]) + remove = np.hstack([t[1:] for t in self.tied_indices]) + else: + remove = np.empty(shape=(0,), dtype=np.int) + + # also remove the fixed params + if len(self.fixed_indices): + remove = np.hstack((remove, np.hstack(self.fixed_indices))) + + # add markers to show that some variables are constrained + for i, t in zip(self.constrained_indices, self.constraints): + for ii in i: + n[ii] = n[ii] + t.__str__() + + n = [nn for i, nn in enumerate(n) if not i in remove] + return n + + @property + def all(self): + return self.__str__(self._get_param_names()) + + + def __str__(self, names=None, nw=30): + """ + Return a string describing the parameter names and their ties and constraints + """ + if names is None: + names = self._get_print_names() + name_indices = self.grep_param_names("|".join(names)) + N = len(names) + + if not N: + return "This object has no free parameters." 
+ header = ['Name', 'Value', 'Constraints', 'Ties'] + values = self._get_params()[name_indices] # map(str,self._get_params()) + # sort out the constraints + constraints = [''] * len(self._get_param_names()) + for i, t in zip(self.constrained_indices, self.constraints): + for ii in i: + constraints[ii] = t.__str__() + for i in self.fixed_indices: + for ii in i: + constraints[ii] = 'Fixed' + # sort out the ties + ties = [''] * len(names) + for i, tie in enumerate(self.tied_indices): + for j in tie: + ties[j] = '(' + str(i) + ')' + + values = ['%.4f' % float(v) for v in values] + max_names = max([len(names[i]) for i in range(len(names))] + [len(header[0])]) + max_values = max([len(values[i]) for i in range(len(values))] + [len(header[1])]) + max_constraint = max([len(constraints[i]) for i in range(len(constraints))] + [len(header[2])]) + max_ties = max([len(ties[i]) for i in range(len(ties))] + [len(header[3])]) + cols = np.array([max_names, max_values, max_constraint, max_ties]) + 4 + # columns = cols.sum() + + header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))] + header_string = map(lambda x: '|'.join(x), [header_string]) + separator = '-' * len(header_string[0]) + param_string = ["{n:^{c0}}|{v:^{c1}}|{c:^{c2}}|{t:^{c3}}".format(n=names[i], v=values[i], c=constraints[i], t=ties[i], c0=cols[0], c1=cols[1], c2=cols[2], c3=cols[3]) for i in range(len(values))] + + + return ('\n'.join([header_string[0], separator] + param_string)) + '\n' diff --git a/GPy/core/priors.py b/GPy/core/priors.py new file mode 100644 index 00000000..43090ae3 --- /dev/null +++ b/GPy/core/priors.py @@ -0,0 +1,217 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
+# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +import pylab as pb +from scipy.special import gammaln, digamma +from ..util.linalg import pdinv +from GPy.core.domains import REAL, POSITIVE +import warnings + +class Prior: + domain = None + def pdf(self, x): + return np.exp(self.lnpdf(x)) + + def plot(self): + rvs = self.rvs(1000) + pb.hist(rvs, 100, normed=True) + xmin, xmax = pb.xlim() + xx = np.linspace(xmin, xmax, 1000) + pb.plot(xx, self.pdf(xx), 'r', linewidth=2) + + +class Gaussian(Prior): + """ + Implementation of the univariate Gaussian probability function, coupled with random variables. + + :param mu: mean + :param sigma: standard deviation + + .. Note:: Bishop 2006 notation is used throughout the code + + """ + domain = REAL + def __init__(self, mu, sigma): + self.mu = float(mu) + self.sigma = float(sigma) + self.sigma2 = np.square(self.sigma) + self.constant = -0.5 * np.log(2 * np.pi * self.sigma2) + + def __str__(self): + return "N(" + str(np.round(self.mu)) + ', ' + str(np.round(self.sigma2)) + ')' + + def lnpdf(self, x): + return self.constant - 0.5 * np.square(x - self.mu) / self.sigma2 + + def lnpdf_grad(self, x): + return -(x - self.mu) / self.sigma2 + + def rvs(self, n): + return np.random.randn(n) * self.sigma + self.mu + + +class LogGaussian(Prior): + """ + Implementation of the univariate *log*-Gaussian probability function, coupled with random variables. + + :param mu: mean + :param sigma: standard deviation + + .. 
Note:: Bishop 2006 notation is used throughout the code + + """ + domain = POSITIVE + def __init__(self, mu, sigma): + self.mu = float(mu) + self.sigma = float(sigma) + self.sigma2 = np.square(self.sigma) + self.constant = -0.5 * np.log(2 * np.pi * self.sigma2) + + def __str__(self): + return "lnN(" + str(np.round(self.mu)) + ', ' + str(np.round(self.sigma2)) + ')' + + def lnpdf(self, x): + return self.constant - 0.5 * np.square(np.log(x) - self.mu) / self.sigma2 - np.log(x) + + def lnpdf_grad(self, x): + return -((np.log(x) - self.mu) / self.sigma2 + 1.) / x + + def rvs(self, n): + return np.exp(np.random.randn(n) * self.sigma + self.mu) + + +class MultivariateGaussian: + """ + Implementation of the multivariate Gaussian probability function, coupled with random variables. + + :param mu: mean (N-dimensional array) + :param var: covariance matrix (NxN) + + .. Note:: Bishop 2006 notation is used throughout the code + + """ + domain = REAL + def __init__(self, mu, var): + self.mu = np.array(mu).flatten() + self.var = np.array(var) + assert len(self.var.shape) == 2 + assert self.var.shape[0] == self.var.shape[1] + assert self.var.shape[0] == self.mu.size + self.input_dim = self.mu.size + self.inv, self.hld = pdinv(self.var) + self.constant = -0.5 * self.input_dim * np.log(2 * np.pi) - self.hld + + def summary(self): + raise NotImplementedError + + def pdf(self, x): + return np.exp(self.lnpdf(x)) + + def lnpdf(self, x): + d = x - self.mu + return self.constant - 0.5 * np.sum(d * np.dot(d, self.inv), 1) + + def lnpdf_grad(self, x): + d = x - self.mu + return -np.dot(self.inv, d) + + def rvs(self, n): + return np.random.multivariate_normal(self.mu, self.var, n) + + def plot(self): + if self.input_dim == 2: + rvs = self.rvs(200) + pb.plot(rvs[:, 0], rvs[:, 1], 'kx', mew=1.5) + xmin, xmax = pb.xlim() + ymin, ymax = pb.ylim() + xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j] + xflat = np.vstack((xx.flatten(), yy.flatten())).T + zz = self.pdf(xflat).reshape(100, 100) + 
pb.contour(xx, yy, zz, linewidths=2) + + +def gamma_from_EV(E, V): + warnings.warn("use Gamma.from_EV to create Gamma Prior", FutureWarning) + return Gamma.from_EV(E, V) + + +class Gamma(Prior): + """ + Implementation of the Gamma probability function, coupled with random variables. + + :param a: shape parameter + :param b: rate parameter (warning: it's the *inverse* of the scale) + + .. Note:: Bishop 2006 notation is used throughout the code + + """ + domain = POSITIVE + def __init__(self, a, b): + self.a = float(a) + self.b = float(b) + self.constant = -gammaln(self.a) + a * np.log(b) + + def __str__(self): + return "Ga(" + str(np.round(self.a)) + ', ' + str(np.round(self.b)) + ')' + + def summary(self): + ret = {"E[x]": self.a / self.b, \ + "E[ln x]": digamma(self.a) - np.log(self.b), \ + "var[x]": self.a / self.b / self.b, \ + "Entropy": gammaln(self.a) - (self.a - 1.) * digamma(self.a) - np.log(self.b) + self.a} + if self.a > 1: + ret['Mode'] = (self.a - 1.) / self.b + else: + ret['mode'] = np.nan + return ret + + def lnpdf(self, x): + return self.constant + (self.a - 1) * np.log(x) - self.b * x + + def lnpdf_grad(self, x): + return (self.a - 1.) / x - self.b + + def rvs(self, n): + return np.random.gamma(scale=1. / self.b, shape=self.a, size=n) + @staticmethod + def from_EV(E, V): + """ + Creates an instance of a Gamma Prior by specifying the Expected value(s) + and Variance(s) of the distribution. + + :param E: expected value + :param V: variance + """ + a = np.square(E) / V + b = E / V + return Gamma(a, b) + +class inverse_gamma(Prior): + """ + Implementation of the inverse-Gamma probability function, coupled with random variables. + + :param a: shape parameter + :param b: rate parameter (warning: it's the *inverse* of the scale) + + .. 
Note:: Bishop 2006 notation is used throughout the code + + """ + domain = POSITIVE + def __init__(self, a, b): + self.a = float(a) + self.b = float(b) + self.constant = -gammaln(self.a) + a * np.log(b) + + def __str__(self): + return "iGa(" + str(np.round(self.a)) + ', ' + str(np.round(self.b)) + ')' + + def lnpdf(self, x): + return self.constant - (self.a + 1) * np.log(x) - self.b / x + + def lnpdf_grad(self, x): + return -(self.a + 1.) / x + self.b / x ** 2 + + def rvs(self, n): + return 1. / np.random.gamma(scale=1. / self.b, shape=self.a, size=n) diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py new file mode 100644 index 00000000..40cfd404 --- /dev/null +++ b/GPy/core/sparse_gp.py @@ -0,0 +1,347 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +import pylab as pb +from ..util.linalg import mdot, jitchol, tdot, symmetrify, backsub_both_sides, chol_inv, dtrtrs, dpotrs, dpotri +from scipy import linalg +from ..likelihoods import Gaussian +from gp_base import GPBase + +class SparseGP(GPBase): + """ + Variational sparse GP model + + :param X: inputs + :type X: np.ndarray (num_data x input_dim) + :param likelihood: a likelihood instance, containing the observed data + :type likelihood: GPy.likelihood.(Gaussian | EP | Laplace) + :param kernel : the kernel (covariance function). See link kernels + :type kernel: a GPy.kern.kern instance + :param X_variance: The uncertainty in the measurements of X (Gaussian variance) + :type X_variance: np.ndarray (num_data x input_dim) | None + :param Z: inducing inputs (optional, see note) + :type Z: np.ndarray (num_inducing x input_dim) | None + :param num_inducing : Number of inducing points (optional, default 10. 
Ignored if Z is not None) + :type num_inducing: int + :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) + :type normalize_(X|Y): bool + """ + + def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False): + GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X) + + self.Z = Z + self.num_inducing = Z.shape[0] +# self.likelihood = likelihood + + if X_variance is None: + self.has_uncertain_inputs = False + self.X_variance = None + else: + assert X_variance.shape == X.shape + self.has_uncertain_inputs = True + self.X_variance = X_variance + + if normalize_X: + self.Z = (self.Z.copy() - self._Xoffset) / self._Xscale + + # normalize X uncertainty also + if self.has_uncertain_inputs: + self.X_variance /= np.square(self._Xscale) + + self._const_jitter = None + + def getstate(self): + """ + Get the current state of the class, + here just all the indices, rest can get recomputed + """ + return GPBase.getstate(self) + [self.Z, + self.num_inducing, + self.has_uncertain_inputs, + self.X_variance] + + def setstate(self, state): + self.X_variance = state.pop() + self.has_uncertain_inputs = state.pop() + self.num_inducing = state.pop() + self.Z = state.pop() + GPBase.setstate(self, state) + + def _compute_kernel_matrices(self): + # kernel computations, using BGPLVM notation + self.Kmm = self.kern.K(self.Z) + if self.has_uncertain_inputs: + self.psi0 = self.kern.psi0(self.Z, self.X, self.X_variance) + self.psi1 = self.kern.psi1(self.Z, self.X, self.X_variance) + self.psi2 = self.kern.psi2(self.Z, self.X, self.X_variance) + else: + self.psi0 = self.kern.Kdiag(self.X) + self.psi1 = self.kern.K(self.X, self.Z) + self.psi2 = None + + def _computations(self): + if self._const_jitter is None or not(self._const_jitter.shape[0] == self.num_inducing): + self._const_jitter = np.eye(self.num_inducing) * 1e-7 + + # factor Kmm + self._Lm = jitchol(self.Kmm + self._const_jitter) + # TODO: no white 
kernel needed anymore, all noise in likelihood -------- + + # The rather complex computations of self._A + if self.has_uncertain_inputs: + if self.likelihood.is_heteroscedastic: + psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.num_data, 1, 1))).sum(0) + else: + psi2_beta = self.psi2.sum(0) * self.likelihood.precision + evals, evecs = linalg.eigh(psi2_beta) + clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable + if not np.array_equal(evals, clipped_evals): + pass # print evals + tmp = evecs * np.sqrt(clipped_evals) + tmp = tmp.T + else: + if self.likelihood.is_heteroscedastic: + tmp = self.psi1 * (np.sqrt(self.likelihood.precision.flatten().reshape(self.num_data, 1))) + else: + tmp = self.psi1 * (np.sqrt(self.likelihood.precision)) + tmp, _ = dtrtrs(self._Lm, np.asfortranarray(tmp.T), lower=1) + self._A = tdot(tmp) + + + # factor B + self.B = np.eye(self.num_inducing) + self._A + self.LB = jitchol(self.B) + + # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency! 
+ self.psi1Vf = np.dot(self.psi1.T, self.likelihood.VVT_factor) + + # back substutue C into psi1Vf + tmp, info1 = dtrtrs(self._Lm, np.asfortranarray(self.psi1Vf), lower=1, trans=0) + self._LBi_Lmi_psi1Vf, _ = dtrtrs(self.LB, np.asfortranarray(tmp), lower=1, trans=0) + # tmp, info2 = dpotrs(self.LB, tmp, lower=1) + tmp, info2 = dtrtrs(self.LB, self._LBi_Lmi_psi1Vf, lower=1, trans=1) + self.Cpsi1Vf, info3 = dtrtrs(self._Lm, tmp, lower=1, trans=1) + + # Compute dL_dKmm + tmp = tdot(self._LBi_Lmi_psi1Vf) + self.data_fit = np.trace(tmp) + self.DBi_plus_BiPBi = backsub_both_sides(self.LB, self.output_dim * np.eye(self.num_inducing) + tmp) + tmp = -0.5 * self.DBi_plus_BiPBi + tmp += -0.5 * self.B * self.output_dim + tmp += self.output_dim * np.eye(self.num_inducing) + self.dL_dKmm = backsub_both_sides(self._Lm, tmp) + + # Compute dL_dpsi # FIXME: this is untested for the heterscedastic + uncertain inputs case + self.dL_dpsi0 = -0.5 * self.output_dim * (self.likelihood.precision * np.ones([self.num_data, 1])).flatten() + self.dL_dpsi1 = np.dot(self.likelihood.VVT_factor, self.Cpsi1Vf.T) + dL_dpsi2_beta = 0.5 * backsub_both_sides(self._Lm, self.output_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi) + + if self.likelihood.is_heteroscedastic: + if self.has_uncertain_inputs: + self.dL_dpsi2 = self.likelihood.precision.flatten()[:, None, None] * dL_dpsi2_beta[None, :, :] + else: + self.dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (self.psi1 * self.likelihood.precision.reshape(self.num_data, 1)).T).T + self.dL_dpsi2 = None + else: + dL_dpsi2 = self.likelihood.precision * dL_dpsi2_beta + if self.has_uncertain_inputs: + # repeat for each of the N psi_2 matrices + self.dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], self.num_data, axis=0) + else: + # subsume back into psi1 (==Kmn) + self.dL_dpsi1 += 2.*np.dot(self.psi1, dL_dpsi2) + self.dL_dpsi2 = None + + + # the partial derivative vector for the likelihood + if self.likelihood.Nparams == 0: + # save computation here. 
+ self.partial_for_likelihood = None + elif self.likelihood.is_heteroscedastic: + raise NotImplementedError, "heteroscedatic derivates not implemented" + else: + # likelihood is not heterscedatic + self.partial_for_likelihood = -0.5 * self.num_data * self.output_dim * self.likelihood.precision + 0.5 * self.likelihood.trYYT * self.likelihood.precision ** 2 + self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self._A) * self.likelihood.precision) + self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self._A * self.DBi_plus_BiPBi) - self.data_fit) + + def log_likelihood(self): + """ Compute the (lower bound on the) log marginal likelihood """ + if self.likelihood.is_heteroscedastic: + A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.likelihood.V * self.likelihood.Y) + B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self._A)) + else: + A = -0.5 * self.num_data * self.output_dim * (np.log(2.*np.pi) - np.log(self.likelihood.precision)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT + B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self._A)) + C = -self.output_dim * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.num_inducing * np.log(sf2)) + D = 0.5 * self.data_fit + return A + B + C + D + self.likelihood.Z + + def _set_params(self, p): + self.Z = p[:self.num_inducing * self.input_dim].reshape(self.num_inducing, self.input_dim) + self.kern._set_params(p[self.Z.size:self.Z.size + self.kern.num_params]) + self.likelihood._set_params(p[self.Z.size + self.kern.num_params:]) + self._compute_kernel_matrices() + self._computations() + self.Cpsi1V = None + + def _get_params(self): + return np.hstack([self.Z.flatten(), self.kern._get_params_transformed(), self.likelihood._get_params()]) + + def 
_get_param_names(self): + return sum([['iip_%i_%i' % (i, j) for j in range(self.Z.shape[1])] for i in range(self.Z.shape[0])], [])\ + + self.kern._get_param_names_transformed() + self.likelihood._get_param_names() + + def _get_print_names(self): + return self.kern._get_param_names_transformed() + self.likelihood._get_param_names() + + def update_likelihood_approximation(self): + """ + Approximates a non-gaussian likelihood using Expectation Propagation + + For a Gaussian likelihood, no iteration is required: + this function does nothing + """ + if not isinstance(self.likelihood, Gaussian): # Updates not needed for Gaussian likelihood + self.likelihood.restart() + if self.has_uncertain_inputs: + Lmi = chol_inv(self._Lm) + Kmmi = tdot(Lmi.T) + diag_tr_psi2Kmmi = np.array([np.trace(psi2_Kmmi) for psi2_Kmmi in np.dot(self.psi2, Kmmi)]) + + self.likelihood.fit_FITC(self.Kmm, self.psi1.T, diag_tr_psi2Kmmi) # This uses the fit_FITC code, but does not perfomr a FITC-EP.#TODO solve potential confusion + # raise NotImplementedError, "EP approximation not implemented for uncertain inputs" + else: + self.likelihood.fit_DTC(self.Kmm, self.psi1.T) + # self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0) + self._set_params(self._get_params()) # update the GP + + def _log_likelihood_gradients(self): + return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood))) + + def dL_dtheta(self): + """ + Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel + """ + dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z) + if self.has_uncertain_inputs: + dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X, self.X_variance) + dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X, self.X_variance) + dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X, self.X_variance) + else: + dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.X, self.Z) + 
dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X) + + return dL_dtheta + + def dL_dZ(self): + """ + The derivative of the bound wrt the inducing inputs Z + """ + dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm, self.Z) # factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ + if self.has_uncertain_inputs: + dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1, self.Z, self.X, self.X_variance) + dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2, self.Z, self.X, self.X_variance) + else: + dL_dZ += self.kern.dK_dX(self.dL_dpsi1.T, self.Z, self.X) + return dL_dZ + + def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False): + """ + Internal helper function for making predictions, does not account for + normalization or likelihood function + """ + + Bi, _ = dpotri(self.LB, lower=0) # WTH? this lower switch should be 1, but that doesn't work! + symmetrify(Bi) + Kmmi_LmiBLmi = backsub_both_sides(self._Lm, np.eye(self.num_inducing) - Bi) + + if self.Cpsi1V is None: + psi1V = np.dot(self.psi1.T, self.likelihood.V) + tmp, _ = dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0) + tmp, _ = dpotrs(self.LB, tmp, lower=1) + self.Cpsi1V, _ = dtrtrs(self._Lm, tmp, lower=1, trans=1) + + if X_variance_new is None: + Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts) + mu = np.dot(Kx.T, self.Cpsi1V) + if full_cov: + Kxx = self.kern.K(Xnew, which_parts=which_parts) + var = Kxx - mdot(Kx.T, Kmmi_LmiBLmi, Kx) # NOTE this won't work for plotting + else: + Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts) + var = Kxx - np.sum(Kx * np.dot(Kmmi_LmiBLmi, Kx), 0) + else: + # assert which_p.Tarts=='all', "swithching out parts of variational kernels is not implemented" + Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts + mu = np.dot(Kx, self.Cpsi1V) + if full_cov: + raise NotImplementedError, "TODO" + else: + Kxx = self.kern.psi0(self.Z, Xnew, X_variance_new) + psi2 = self.kern.psi2(self.Z, Xnew, 
X_variance_new) + var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1) + + return mu, var[:, None] + + def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False): + """ + Predict the function(s) at the new point(s) Xnew. + + Arguments + --------- + :param Xnew: The points at which to make a prediction + :type Xnew: np.ndarray, Nnew x self.input_dim + :param X_variance_new: The uncertainty in the prediction points + :type X_variance_new: np.ndarray, Nnew x self.input_dim + :param which_parts: specifies which outputs kernel(s) to use in prediction + :type which_parts: ('all', list of bools) + :param full_cov: whether to return the full covariance matrix, or just the diagonal + :type full_cov: bool + :rtype: posterior mean, a Numpy array, Nnew x self.input_dim + :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise + :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim + + + If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew. + This is to allow for different normalizations of the output dimensions. 
+ + """ + # normalize X values + Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale + if X_variance_new is not None: + X_variance_new = X_variance_new / self._Xscale ** 2 + + # here's the actual prediction by the GP model + mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts) + + # now push through likelihood + mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov) + + return mean, var, _025pm, _975pm + + def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None): + + if ax is None: + fig = pb.figure(num=fignum) + ax = fig.add_subplot(111) + if which_data is 'all': + which_data = slice(None) + + GPBase.plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, ax=ax) + + # add the inducing inputs and some errorbars + if self.X.shape[1] == 1: + if self.has_uncertain_inputs: + Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now + ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0], + xerr=2 * np.sqrt(self.X_variance[which_data, 0]), + ecolor='k', fmt=None, elinewidth=.5, alpha=.5) + Zu = self.Z * self._Xscale + self._Xoffset + ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12) + + elif self.X.shape[1] == 2: + Zu = self.Z * self._Xscale + self._Xoffset + ax.plot(Zu[:, 0], Zu[:, 1], 'wo') diff --git a/GPy/core/svigp.py b/GPy/core/svigp.py new file mode 100644 index 00000000..b0175a39 --- /dev/null +++ b/GPy/core/svigp.py @@ -0,0 +1,518 @@ +# Copyright (c) 2012, James Hensman and Nicolo' Fusi +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +import pylab as pb +from .. 
import kern +from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs, jitchol, backsub_both_sides +from ..likelihoods import EP +from gp_base import GPBase +from model import Model +import time +import sys + + +class SVIGP(GPBase): + """ + Stochastic Variational inference in a Gaussian Process + + :param X: inputs + :type X: np.ndarray (N x Q) + :param Y: observed data + :type Y: np.ndarray of observations (N x D) + :param batchsize: the size of a h + + Additional kwargs are used as for a sparse GP. They include + + :param q_u: canonical parameters of the distribution squasehd into a 1D array + :type q_u: np.ndarray + :param M : Number of inducing points (optional, default 10. Ignored if Z is not None) + :type M: int + :param kernel : the kernel/covariance function. See link kernels + :type kernel: a GPy kernel + :param Z: inducing inputs (optional, see note) + :type Z: np.ndarray (M x Q) | None + :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance) + :type X_uncertainty: np.ndarray (N x Q) | None + :param Zslices: slices for the inducing inputs (see slicing TODO: link) + :param M : Number of inducing points (optional, default 10. Ignored if Z is not None) + :type M: int + :param beta: noise precision. 
TODO> ignore beta if doing EP + :type beta: float + :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) + :type normalize_(X|Y): bool + """ + + + def __init__(self, X, likelihood, kernel, Z, q_u=None, batchsize=10, X_variance=None): + GPBase.__init__(self, X, likelihood, kernel, normalize_X=False) + self.batchsize=batchsize + self.Y = self.likelihood.Y.copy() + self.Z = Z + self.num_inducing = Z.shape[0] + + self.batchcounter = 0 + self.epochs = 0 + self.iterations = 0 + + self.vb_steplength = 0.05 + self.param_steplength = 1e-5 + self.momentum = 0.9 + + if X_variance is None: + self.has_uncertain_inputs = False + else: + self.has_uncertain_inputs = True + self.X_variance = X_variance + + + if q_u is None: + q_u = np.hstack((np.random.randn(self.num_inducing*self.output_dim),-.5*np.eye(self.num_inducing).flatten())) + self.set_vb_param(q_u) + + self._permutation = np.random.permutation(self.num_data) + self.load_batch() + + self._param_trace = [] + self._ll_trace = [] + self._grad_trace = [] + + #set the adaptive steplength parameters + self.hbar_t = 0.0 + self.tau_t = 100.0 + self.gbar_t = 0.0 + self.gbar_t1 = 0.0 + self.gbar_t2 = 0.0 + self.hbar_tp = 0.0 + self.tau_tp = 10000.0 + self.gbar_tp = 0.0 + self.adapt_param_steplength = True + self.adapt_vb_steplength = True + self._param_steplength_trace = [] + self._vb_steplength_trace = [] + + def getstate(self): + steplength_params = [self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength] + return GPBase.getstate(self) + \ + [self.get_vb_param(), + self.Z, + self.num_inducing, + self.has_uncertain_inputs, + self.X_variance, + self.X_batch, + self.X_variance_batch, + steplength_params, + self.batchcounter, + self.batchsize, + self.epochs, + self.momentum, + self.data_prop, + self._param_trace, + 
self._param_steplength_trace, + self._vb_steplength_trace, + self._ll_trace, + self._grad_trace, + self.Y, + self._permutation, + self.iterations + ] + + def setstate(self, state): + self.iterations = state.pop() + self._permutation = state.pop() + self.Y = state.pop() + self._grad_trace = state.pop() + self._ll_trace = state.pop() + self._vb_steplength_trace = state.pop() + self._param_steplength_trace = state.pop() + self._param_trace = state.pop() + self.data_prop = state.pop() + self.momentum = state.pop() + self.epochs = state.pop() + self.batchsize = state.pop() + self.batchcounter = state.pop() + steplength_params = state.pop() + (self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength) = steplength_params + self.X_variance_batch = state.pop() + self.X_batch = state.pop() + self.X_variance = state.pop() + self.has_uncertain_inputs = state.pop() + self.num_inducing = state.pop() + self.Z = state.pop() + vb_param = state.pop() + GPBase.setstate(self, state) + self.set_vb_param(vb_param) + + def _compute_kernel_matrices(self): + # kernel computations, using BGPLVM notation + self.Kmm = self.kern.K(self.Z) + if self.has_uncertain_inputs: + self.psi0 = self.kern.psi0(self.Z, self.X_batch, self.X_variance_batch) + self.psi1 = self.kern.psi1(self.Z, self.X_batch, self.X_variance_batch) + self.psi2 = self.kern.psi2(self.Z, self.X_batch, self.X_variance_batch) + else: + self.psi0 = self.kern.Kdiag(self.X_batch) + self.psi1 = self.kern.K(self.X_batch, self.Z) + self.psi2 = None + + def dL_dtheta(self): + dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z) + if self.has_uncertain_inputs: + dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X_batch, self.X_variance_batch) + dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X_batch, self.X_variance_batch) + dL_dtheta += 
self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X_batch, self.X_variance_batch) + else: + dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.X_batch, self.Z) + dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X_batch) + return dL_dtheta + + def _set_params(self, p, computations=True): + self.kern._set_params_transformed(p[:self.kern.num_params]) + self.likelihood._set_params(p[self.kern.num_params:]) + if computations: + self._compute_kernel_matrices() + self._computations() + + def _get_params(self): + return np.hstack((self.kern._get_params_transformed() , self.likelihood._get_params())) + + def _get_param_names(self): + return self.kern._get_param_names_transformed() + self.likelihood._get_param_names() + + def load_batch(self): + """ + load a batch of data (set self.X_batch and self.likelihood.Y from self.X, self.Y) + """ + + #if we've seen all the data, start again with them in a new random order + if self.batchcounter+self.batchsize > self.num_data: + self.batchcounter = 0 + self.epochs += 1 + self._permutation = np.random.permutation(self.num_data) + + this_perm = self._permutation[self.batchcounter:self.batchcounter+self.batchsize] + + self.X_batch = self.X[this_perm] + self.likelihood.set_data(self.Y[this_perm]) + if self.has_uncertain_inputs: + self.X_variance_batch = self.X_variance[this_perm] + + self.batchcounter += self.batchsize + + self.data_prop = float(self.batchsize)/self.num_data + + self._compute_kernel_matrices() + self._computations() + + def _computations(self,do_Kmm=True, do_Kmm_grad=True): + """ + All of the computations needed. Some are optional, see kwargs. 
+ """ + + if do_Kmm: + self.Lm = jitchol(self.Kmm) + + # The rather complex computations of self.A + if self.has_uncertain_inputs: + if self.likelihood.is_heteroscedastic: + psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.batchsize, 1, 1))).sum(0) + else: + psi2_beta = self.psi2.sum(0) * self.likelihood.precision + evals, evecs = np.linalg.eigh(psi2_beta) + clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable + tmp = evecs * np.sqrt(clipped_evals) + else: + if self.likelihood.is_heteroscedastic: + tmp = self.psi1.T * (np.sqrt(self.likelihood.precision.flatten().reshape(1, self.batchsize))) + else: + tmp = self.psi1.T * (np.sqrt(self.likelihood.precision)) + tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1) + self.A = tdot(tmp) + + self.V = self.likelihood.precision*self.likelihood.Y + self.VmT = np.dot(self.V,self.q_u_expectation[0].T) + self.psi1V = np.dot(self.psi1.T, self.V) + + self.B = np.eye(self.num_inducing)*self.data_prop + self.A + self.Lambda = backsub_both_sides(self.Lm, self.B.T) + self.LQL = backsub_both_sides(self.Lm,self.q_u_expectation[1].T,transpose='right') + + self.trace_K = self.psi0.sum() - np.trace(self.A)/self.likelihood.precision + self.Kmmi_m, _ = dpotrs(self.Lm, self.q_u_expectation[0], lower=1) + self.projected_mean = np.dot(self.psi1,self.Kmmi_m) + + # Compute dL_dpsi + self.dL_dpsi0 = - 0.5 * self.output_dim * self.likelihood.precision * np.ones(self.batchsize) + self.dL_dpsi1, _ = dpotrs(self.Lm,np.asfortranarray(self.VmT.T),lower=1) + self.dL_dpsi1 = self.dL_dpsi1.T + + dL_dpsi2 = -0.5 * self.likelihood.precision * backsub_both_sides(self.Lm, self.LQL - self.output_dim * np.eye(self.num_inducing)) + if self.has_uncertain_inputs: + self.dL_dpsi2 = np.repeat(dL_dpsi2[None,:,:],self.batchsize,axis=0) + else: + self.dL_dpsi1 += 2.*np.dot(dL_dpsi2,self.psi1.T).T + self.dL_dpsi2 = None + + # Compute dL_dKmm + if do_Kmm_grad: + tmp = np.dot(self.LQL,self.A) - 
backsub_both_sides(self.Lm,np.dot(self.q_u_expectation[0],self.psi1V.T),transpose='right') + tmp += tmp.T + tmp += -self.output_dim*self.B + tmp += self.data_prop*self.LQL + self.dL_dKmm = 0.5*backsub_both_sides(self.Lm,tmp) + + #Compute the gradient of the log likelihood wrt noise variance + self.partial_for_likelihood = -0.5*(self.batchsize*self.output_dim - np.sum(self.A*self.LQL))*self.likelihood.precision + self.partial_for_likelihood += (0.5*self.output_dim*self.trace_K + 0.5 * self.likelihood.trYYT - np.sum(self.likelihood.Y*self.projected_mean))*self.likelihood.precision**2 + + + def log_likelihood(self): + """ + As for uncollapsed sparse GP, but account for the proportion of data we're looking at right now. + + NB. self.batchsize is the size of the batch, not the size of X_all + """ + assert not self.likelihood.is_heteroscedastic + A = -0.5*self.batchsize*self.output_dim*(np.log(2.*np.pi) - np.log(self.likelihood.precision)) + B = -0.5*self.likelihood.precision*self.output_dim*self.trace_K + Kmm_logdet = 2.*np.sum(np.log(np.diag(self.Lm))) + C = -0.5*self.output_dim*self.data_prop*(Kmm_logdet-self.q_u_logdet - self.num_inducing) + C += -0.5*np.sum(self.LQL * self.B) + D = -0.5*self.likelihood.precision*self.likelihood.trYYT + E = np.sum(self.V*self.projected_mean) + return (A+B+C+D+E)/self.data_prop + + def _log_likelihood_gradients(self): + return np.hstack((self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))/self.data_prop + + def vb_grad_natgrad(self): + """ + Compute the gradients of the lower bound wrt the canonical and + Expectation parameters of u. 
+ + Note that the natural gradient in either is given by the gradient in the other (See Hensman et al 2012 Fast Variational inference in the conjugate exponential Family) + """ + + # Gradient for eta + dL_dmmT_S = -0.5*self.Lambda/self.data_prop + 0.5*self.q_u_prec + Kmmipsi1V,_ = dpotrs(self.Lm,self.psi1V,lower=1) + dL_dm = (Kmmipsi1V - np.dot(self.Lambda,self.q_u_mean))/self.data_prop + + # Gradients for theta + S = self.q_u_cov + Si = self.q_u_prec + m = self.q_u_mean + dL_dSi = -mdot(S,dL_dmmT_S, S) + + dL_dmhSi = -2*dL_dSi + dL_dSim = np.dot(dL_dSi,m) + np.dot(Si, dL_dm) + + return np.hstack((dL_dm.flatten(),dL_dmmT_S.flatten())) , np.hstack((dL_dSim.flatten(), dL_dmhSi.flatten())) + + + def optimize(self, iterations, print_interval=10, callback=lambda:None, callback_interval=5): + + param_step = 0. + + #Iterate! + for i in range(iterations): + + #store the current configuration for plotting later + self._param_trace.append(self._get_params()) + self._ll_trace.append(self.log_likelihood() + self.log_prior()) + + #load a batch + self.load_batch() + + #compute the (stochastic) gradient + natgrads = self.vb_grad_natgrad() + grads = self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()) + self._grad_trace.append(grads) + + #compute the steps in all parameters + vb_step = self.vb_steplength*natgrads[0] + if (self.epochs>=1):#only move the parameters after the first epoch + param_step = self.momentum*param_step + self.param_steplength*grads + else: + param_step = 0. 
+ + self.set_vb_param(self.get_vb_param() + vb_step) + #Note: don't recompute everything here, wait until the next iteration when we have a new batch + self._set_params(self._untransform_params(self._get_params_transformed() + param_step), computations=False) + + #print messages if desired + if i and (not i%print_interval): + print i, np.mean(self._ll_trace[-print_interval:]) #, self.log_likelihood() + print np.round(np.mean(self._grad_trace[-print_interval:],0),3) + sys.stdout.flush() + + #callback + if i and not i%callback_interval: + callback() + time.sleep(0.1) + + if self.epochs > 10: + self._adapt_steplength() + + self.iterations += 1 + + + def _adapt_steplength(self): + if self.adapt_vb_steplength: + # self._adaptive_vb_steplength() + self._adaptive_vb_steplength_KL() + self._vb_steplength_trace.append(self.vb_steplength) + assert self.vb_steplength > 0 + + if self.adapt_param_steplength: + # self._adaptive_param_steplength() + # self._adaptive_param_steplength_log() + self._adaptive_param_steplength_from_vb() + self._param_steplength_trace.append(self.param_steplength) + + def _adaptive_param_steplength(self): + decr_factor = 0.1 + g_tp = self._transform_gradients(self._log_likelihood_gradients()) + self.gbar_tp = (1-1/self.tau_tp)*self.gbar_tp + 1/self.tau_tp * g_tp + self.hbar_tp = (1-1/self.tau_tp)*self.hbar_tp + 1/self.tau_tp * np.dot(g_tp.T, g_tp) + new_param_steplength = np.dot(self.gbar_tp.T, self.gbar_tp) / self.hbar_tp + #- hack + new_param_steplength *= decr_factor + self.param_steplength = (self.param_steplength + new_param_steplength)/2 + #- + self.tau_tp = self.tau_tp*(1-self.param_steplength) + 1 + + def _adaptive_param_steplength_log(self): + stp = np.logspace(np.log(0.0001), np.log(1e-6), base=np.e, num=18000) + self.param_steplength = stp[self.iterations] + + def _adaptive_param_steplength_log2(self): + self.param_steplength = (self.iterations + 0.001)**-0.5 + + def _adaptive_param_steplength_from_vb(self): + self.param_steplength = 
self.vb_steplength * 0.01 + + def _adaptive_vb_steplength(self): + decr_factor = 0.1 + g_t = self.vb_grad_natgrad()[0] + self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t + self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t) + new_vb_steplength = np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t + #- hack + new_vb_steplength *= decr_factor + self.vb_steplength = (self.vb_steplength + new_vb_steplength)/2 + #- + self.tau_t = self.tau_t*(1-self.vb_steplength) + 1 + + def _adaptive_vb_steplength_KL(self): + decr_factor = 1 #0.1 + natgrad = self.vb_grad_natgrad() + g_t1 = natgrad[0] + g_t2 = natgrad[1] + self.gbar_t1 = (1-1/self.tau_t)*self.gbar_t1 + 1/self.tau_t * g_t1 + self.gbar_t2 = (1-1/self.tau_t)*self.gbar_t2 + 1/self.tau_t * g_t2 + self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t1.T, g_t2) + self.vb_steplength = np.dot(self.gbar_t1.T, self.gbar_t2) / self.hbar_t + self.vb_steplength *= decr_factor + self.tau_t = self.tau_t*(1-self.vb_steplength) + 1 + + def _raw_predict(self, X_new, X_variance_new=None, which_parts='all',full_cov=False): + """Internal helper function for making predictions, does not account for normalization""" + + #TODO: make this more efficient! 
+ self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm) + tmp = self.Kmmi- mdot(self.Kmmi,self.q_u_cov,self.Kmmi) + + if X_variance_new is None: + Kx = self.kern.K(X_new,self.Z) + mu = np.dot(Kx,self.Kmmi_m) + if full_cov: + Kxx = self.kern.K(X_new) + var = Kxx - mdot(Kx,tmp,Kx.T) + else: + Kxx = self.kern.Kdiag(X_new) + var = (Kxx - np.sum(Kx*np.dot(Kx,tmp),1))[:,None] + return mu, var + else: + assert X_variance_new.shape == X_new.shape + Kx = self.kern.psi1(self.Z,X_new, X_variance_new) + mu = np.dot(Kx,self.Kmmi_m) + Kxx = self.kern.psi0(self.Z,X_new,X_variance_new) + psi2 = self.kern.psi2(self.Z,X_new,X_variance_new) + diag_var = Kxx - np.sum(np.sum(psi2*tmp[None,:,:],1),1) + if full_cov: + raise NotImplementedError + else: + return mu, diag_var[:,None] + + def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False): + # normalize X values + Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale + if X_variance_new is not None: + X_variance_new = X_variance_new / self._Xscale ** 2 + + # here's the actual prediction by the GP model + mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts) + + # now push through likelihood + mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov) + + return mean, var, _025pm, _975pm + + + def set_vb_param(self,vb_param): + """set the distribution q(u) from the canonical parameters""" + self.q_u_canonical_flat = vb_param.copy() + self.q_u_canonical = self.q_u_canonical_flat[:self.num_inducing*self.output_dim].reshape(self.num_inducing,self.output_dim),self.q_u_canonical_flat[self.num_inducing*self.output_dim:].reshape(self.num_inducing,self.num_inducing) + + self.q_u_prec = -2.*self.q_u_canonical[1] + self.q_u_cov, q_u_Li, q_u_L, tmp = pdinv(self.q_u_prec) + self.q_u_Li = q_u_Li + self.q_u_logdet = -tmp + self.q_u_mean, _ = dpotrs(q_u_Li, np.asfortranarray(self.q_u_canonical[0]),lower=1) + + self.q_u_expectation = (self.q_u_mean, 
np.dot(self.q_u_mean,self.q_u_mean.T)+self.q_u_cov*self.output_dim) + + + def get_vb_param(self): + """ + Return the canonical parameters of the distribution q(u) + """ + return self.q_u_canonical_flat + + + def plot(self, ax=None, fignum=None, Z_height=None, **kwargs): + + if ax is None: + fig = pb.figure(num=fignum) + ax = fig.add_subplot(111) + + #horrible hack here: + data = self.likelihood.data.copy() + self.likelihood.data = self.Y + GPBase.plot(self, ax=ax, **kwargs) + self.likelihood.data = data + + Zu = self.Z * self._Xscale + self._Xoffset + if self.input_dim==1: + ax.plot(self.X_batch, self.likelihood.data, 'gx',mew=2) + if Z_height is None: + Z_height = ax.get_ylim()[0] + ax.plot(Zu, np.zeros_like(Zu) + Z_height, 'r|', mew=1.5, markersize=12) + + if self.input_dim==2: + ax.scatter(self.X[:,0], self.X[:,1], 20., self.Y[:,0], linewidth=0, cmap=pb.cm.jet) + ax.plot(Zu[:,0], Zu[:,1], 'w^') + + def plot_traces(self): + pb.figure() + t = np.array(self._param_trace) + pb.subplot(2,1,1) + for l,ti in zip(self._get_param_names(),t.T): + if not l[:3]=='iip': + pb.plot(ti,label=l) + pb.legend(loc=0) + + pb.subplot(2,1,2) + pb.plot(np.asarray(self._ll_trace),label='stochastic likelihood') + pb.legend(loc=0) diff --git a/GPy/core/transformations.py b/GPy/core/transformations.py new file mode 100644 index 00000000..419bc54e --- /dev/null +++ b/GPy/core/transformations.py @@ -0,0 +1,144 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
+# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +from GPy.core.domains import POSITIVE, NEGATIVE, BOUNDED +import sys +lim_val = -np.log(sys.float_info.epsilon) + +class transformation(object): + domain = None + def f(self, x): + raise NotImplementedError + + def finv(self, x): + raise NotImplementedError + + def gradfactor(self, f): + """ df_dx evaluated at self.f(x)=f""" + raise NotImplementedError + def initialize(self, f): + """ produce a sensible initial value for f(x)""" + raise NotImplementedError + def __str__(self): + raise NotImplementedError + +class logexp(transformation): + domain = POSITIVE + def f(self, x): + return np.where(x>lim_val, x, np.log(1. + np.exp(x))) + def finv(self, f): + return np.where(f>lim_val, f, np.log(np.exp(f) - 1.)) + def gradfactor(self, f): + return np.where(f>lim_val, 1., 1 - np.exp(-f)) + def initialize(self, f): + if np.any(f < 0.): + print "Warning: changing parameters to satisfy constraints" + return np.abs(f) + def __str__(self): + return '(+ve)' + +class negative_logexp(transformation): + domain = NEGATIVE + def f(self, x): + return -logexp.f(x) #np.log(1. + np.exp(x)) + def finv(self, f): + return logexp.finv(-f) #np.log(np.exp(-f) - 1.) + def gradfactor(self, f): + return -logexp.gradfactor(-f) + #ef = np.exp(-f) + #return -(ef - 1.) / ef + def initialize(self, f): + return -logexp.initialize(f) #np.abs(f) + def __str__(self): + return '(-ve)' + +class logexp_clipped(logexp): + max_bound = 1e100 + min_bound = 1e-10 + log_max_bound = np.log(max_bound) + log_min_bound = np.log(min_bound) + domain = POSITIVE + def __init__(self, lower=1e-6): + self.lower = lower + def f(self, x): + exp = np.exp(np.clip(x, self.log_min_bound, self.log_max_bound)) + f = np.log(1. 
+ exp) +# if np.isnan(f).any(): +# import ipdb;ipdb.set_trace() + return np.clip(f, self.min_bound, self.max_bound) + def finv(self, f): + return np.log(np.exp(f) - 1.) + def gradfactor(self, f): + ef = np.exp(f) # np.clip(f, self.min_bound, self.max_bound)) + gf = (ef - 1.) / ef + return gf # np.where(f < self.lower, 0, gf) + def initialize(self, f): + if np.any(f < 0.): + print "Warning: changing parameters to satisfy constraints" + return np.abs(f) + def __str__(self): + return '(+ve_c)' + +class exponent(transformation): + # TODO: can't allow this to go to zero, need to set a lower bound. Similar with negative exponent below. See old MATLAB code. + domain = POSITIVE + def f(self, x): + return np.where(x<lim_val, np.where(x>-lim_val, np.exp(x), np.exp(-lim_val)), np.exp(lim_val)) + def finv(self, x): + return np.log(x) + def gradfactor(self, f): + return f + def initialize(self, f): + if np.any(f < 0.): + print "Warning: changing parameters to satisfy constraints" + return np.abs(f) + def __str__(self): + return '(+ve)' + +class negative_exponent(exponent): + domain = NEGATIVE + def f(self, x): + return -exponent.f(x) + def finv(self, f): + return exponent.finv(-f) + def gradfactor(self, f): + return f + def initialize(self, f): + return -exponent.initialize(f) #np.abs(f) + def __str__(self): + return '(-ve)' + +class square(transformation): + domain = POSITIVE + def f(self, x): + return x ** 2 + def finv(self, x): + return np.sqrt(x) + def gradfactor(self, f): + return 2 * np.sqrt(f) + def initialize(self, f): + return np.abs(f) + def __str__(self): + return '(+sq)' + +class logistic(transformation): + domain = BOUNDED + def __init__(self, lower, upper): + assert lower < upper + self.lower, self.upper = float(lower), float(upper) + self.difference = self.upper - self.lower + def f(self, x): + return self.lower + self.difference / (1.
+ np.exp(-x)) + def finv(self, f): + return np.log(np.clip(f - self.lower, 1e-10, np.inf) / np.clip(self.upper - f, 1e-10, np.inf)) + def gradfactor(self, f): + return (f - self.lower) * (self.upper - f) / self.difference + def initialize(self, f): + if np.any(np.logical_or(f < self.lower, f > self.upper)): + print "Warning: changing parameters to satisfy constraints" + return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(f * 0.), f) + def __str__(self): + return '({},{})'.format(self.lower, self.upper) + diff --git a/GPy/examples/__init__.py b/GPy/examples/__init__.py new file mode 100644 index 00000000..2f74858a --- /dev/null +++ b/GPy/examples/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import classification +import regression +import dimensionality_reduction +import tutorials +import stochastic diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py new file mode 100644 index 00000000..77d1982c --- /dev/null +++ b/GPy/examples/classification.py @@ -0,0 +1,168 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +""" +Gaussian Processes classification +""" +import pylab as pb +import numpy as np +import GPy + +default_seed = 10000 +def crescent_data(seed=default_seed, kernel=None): # FIXME + """Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. + + :param model_type: type of model to fit ['Full', 'FITC', 'DTC']. + :param seed : seed value for data generation. + :type seed: int + :param inducing : number of inducing variables (only used for 'FITC' or 'DTC'). 
+ :type inducing: int + """ + + data = GPy.util.datasets.crescent_data(seed=seed) + Y = data['Y'] + Y[Y.flatten()==-1] = 0 + + m = GPy.models.GPClassification(data['X'], Y) + #m.update_likelihood_approximation() + #m.optimize() + m.pseudo_EM() + print(m) + m.plot() + return m + +def oil(num_inducing=50, max_iters=100, kernel=None): + """ + Run a Gaussian process classification on the three phase oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. + """ + data = GPy.util.datasets.oil() + X = data['X'] + Xtest = data['Xtest'] + Y = data['Y'][:, 0:1] + Ytest = data['Ytest'][:, 0:1] + Y[Y.flatten()==-1] = 0 + Ytest[Ytest.flatten()==-1] = 0 + + # Create GP model + m = GPy.models.SparseGPClassification(X, Y,kernel=kernel,num_inducing=num_inducing) + + # Contrain all parameters to be positive + m.tie_params('.*len') + m['.*len'] = 10. + m.update_likelihood_approximation() + + # Optimize + m.optimize(max_iters=max_iters) + print(m) + + #Test + probs = m.predict(Xtest)[0] + GPy.util.classification.conf_matrix(probs,Ytest) + return m + +def toy_linear_1d_classification(seed=default_seed): + """ + Simple 1D classification example + :param seed : seed value for data generation (default is 4). + :type seed: int + """ + + data = GPy.util.datasets.toy_linear_1d_classification(seed=seed) + Y = data['Y'][:, 0:1] + Y[Y.flatten() == -1] = 0 + + # Model definition + m = GPy.models.GPClassification(data['X'], Y) + + # Optimize + #m.update_likelihood_approximation() + # Parameters optimization: + #m.optimize() + m.pseudo_EM() + + # Plot + fig, axes = pb.subplots(2,1) + m.plot_f(ax=axes[0]) + m.plot(ax=axes[1]) + print(m) + + return m + +def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed): + """ + Sparse 1D classification example + :param seed : seed value for data generation (default is 4). 
+ :type seed: int + """ + + data = GPy.util.datasets.toy_linear_1d_classification(seed=seed) + Y = data['Y'][:, 0:1] + Y[Y.flatten() == -1] = 0 + + # Model definition + m = GPy.models.SparseGPClassification(data['X'], Y,num_inducing=num_inducing) + m['.*len']= 4. + + # Optimize + #m.update_likelihood_approximation() + # Parameters optimization: + #m.optimize() + m.pseudo_EM() + + # Plot + fig, axes = pb.subplots(2,1) + m.plot_f(ax=axes[0]) + m.plot(ax=axes[1]) + print(m) + + return m + +def sparse_crescent_data(num_inducing=10, seed=default_seed, kernel=None): + """ + Run a Gaussian process classification with DTC approxiamtion on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. + + :param model_type: type of model to fit ['Full', 'FITC', 'DTC']. + :param seed : seed value for data generation. + :type seed: int + :param inducing : number of inducing variables (only used for 'FITC' or 'DTC'). + :type inducing: int + """ + + data = GPy.util.datasets.crescent_data(seed=seed) + Y = data['Y'] + Y[Y.flatten()==-1]=0 + + m = GPy.models.SparseGPClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing) + m['.*len'] = 10. + #m.update_likelihood_approximation() + #m.optimize() + m.pseudo_EM() + print(m) + m.plot() + return m + +def FITC_crescent_data(num_inducing=10, seed=default_seed): + """ + Run a Gaussian process classification with FITC approximation on the crescent data. The demonstration uses EP to approximate the likelihood. + + :param model_type: type of model to fit ['Full', 'FITC', 'DTC']. + :param seed : seed value for data generation. + :type seed: int + :param inducing : number of inducing variables (only used for 'FITC' or 'DTC'). 
+ :type num_inducing: int + """ + + data = GPy.util.datasets.crescent_data(seed=seed) + Y = data['Y'] + Y[Y.flatten()==-1]=0 + + m = GPy.models.FITCClassification(data['X'], Y,num_inducing=num_inducing) + m.constrain_bounded('.*len',1.,1e3) + m['.*len'] = 3. + #m.update_likelihood_approximation() + #m.optimize() + m.pseudo_EM() + print(m) + m.plot() + return m diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py new file mode 100644 index 00000000..005b131f --- /dev/null +++ b/GPy/examples/dimensionality_reduction.py @@ -0,0 +1,492 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +from matplotlib import pyplot as plt, cm + +import GPy +from GPy.core.transformations import logexp +from GPy.models.bayesian_gplvm import BayesianGPLVM +from GPy.likelihoods.gaussian import Gaussian + +default_seed = np.random.seed(123344) + +def BGPLVM(seed=default_seed): + N = 5 + num_inducing = 4 + Q = 3 + D = 2 + # generate GPLVM-like data + X = np.random.rand(N, Q) + lengthscales = np.random.rand(Q) + k = (GPy.kern.rbf(Q, .5, lengthscales, ARD=True) + + GPy.kern.white(Q, 0.01)) + K = k.K(X) + Y = np.random.multivariate_normal(np.zeros(N), K, D).T + lik = Gaussian(Y, normalize=True) + + k = GPy.kern.rbf_inv(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q) + # k = GPy.kern.rbf(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001) + # k = GPy.kern.rbf(Q, ARD = False) + GPy.kern.white(Q, 0.00001) + + m = GPy.models.BayesianGPLVM(lik, Q, kernel=k, num_inducing=num_inducing) + m.lengthscales = lengthscales + # m.constrain_positive('(rbf|bias|noise|white|S)') + # m.constrain_fixed('S', 1) + + # pb.figure() + # m.plot() + # pb.title('PCA initialisation') + # pb.figure() + # m.optimize(messages = 1) + # m.plot() + # pb.title('After optimisation') + # m.randomize() + # m.checkgrad(verbose=1) + + return m + +def 
GPLVM_oil_100(optimize=True): + data = GPy.util.datasets.oil_100() + Y = data['X'] + + # create simple GP model + kernel = GPy.kern.rbf(6, ARD=True) + GPy.kern.bias(6) + m = GPy.models.GPLVM(Y, 6, kernel=kernel) + m.data_labels = data['Y'].argmax(axis=1) + + # optimize + if optimize: + m.optimize('scg', messages=1) + + # plot + print(m) + m.plot_latent(labels=m.data_labels) + return m + +def sparseGPLVM_oil(optimize=True, N=100, Q=6, num_inducing=15, max_iters=50): + np.random.seed(0) + data = GPy.util.datasets.oil() + + Y = data['X'][:N] + Y = Y - Y.mean(0) + Y /= Y.std(0) + + # create simple GP model + kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q) + m = GPy.models.SparseGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing) + m.data_labels = data['Y'].argmax(axis=1) + + # optimize + if optimize: + m.optimize('scg', messages=1, max_iters=max_iters) + + # plot + print(m) + # m.plot_latent(labels=m.data_labels) + return m + +def swiss_roll(optimize=True, N=1000, num_inducing=15, Q=4, sigma=.2, plot=False): + from GPy.util.datasets import swiss_roll_generated + from GPy.core.transformations import logexp_clipped + + data = swiss_roll_generated(N=N, sigma=sigma) + Y = data['Y'] + Y -= Y.mean() + Y /= Y.std() + + t = data['t'] + c = data['colors'] + + try: + from sklearn.manifold.isomap import Isomap + iso = Isomap().fit(Y) + X = iso.embedding_ + if Q > 2: + X = np.hstack((X, np.random.randn(N, Q - 2))) + except ImportError: + X = np.random.randn(N, Q) + + if plot: + from mpl_toolkits import mplot3d + import pylab + fig = pylab.figure("Swiss Roll Data") + ax = fig.add_subplot(121, projection='3d') + ax.scatter(*Y.T, c=c) + ax.set_title("Swiss Roll") + + ax = fig.add_subplot(122) + ax.scatter(*X.T[:2], c=c) + ax.set_title("Initialization") + + + var = .5 + S = (var * np.ones_like(X) + np.clip(np.random.randn(N, Q) * var ** 2, + - (1 - var), + (1 - var))) + .001 + Z = np.random.permutation(X)[:num_inducing] + + kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, 
np.exp(-2)) + GPy.kern.white(Q, np.exp(-2)) + + m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel) + m.data_colors = c + m.data_t = t + + m['rbf_lengthscale'] = 1. # X.var(0).max() / X.var(0) + m['noise_variance'] = Y.var() / 100. + m['bias_variance'] = 0.05 + + if optimize: + m.optimize('scg', messages=1) + return m + +def BGPLVM_oil(optimize=True, N=200, Q=7, num_inducing=40, max_iters=1000, plot=False, **k): + np.random.seed(0) + data = GPy.util.datasets.oil() + + # create simple GP model + kernel = GPy.kern.rbf_inv(Q, 1., [.1] * Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + + Y = data['X'][:N] + Yn = Gaussian(Y, normalize=True) +# Yn = Y - Y.mean(0) +# Yn /= Yn.std(0) + + m = GPy.models.BayesianGPLVM(Yn, Q, kernel=kernel, num_inducing=num_inducing, **k) + m.data_labels = data['Y'][:N].argmax(axis=1) + + # m.constrain('variance|leng', logexp_clipped()) + # m['.*lengt'] = m.X.var(0).max() / m.X.var(0) + m['noise'] = Yn.Y.var() / 100. + + + # optimize + if optimize: + m.constrain_fixed('noise') + m.optimize('scg', messages=1, max_iters=200, gtol=.05) + m.constrain_positive('noise') + m.constrain_bounded('white', 1e-7, 1) + m.optimize('scg', messages=1, max_iters=max_iters, gtol=.05) + + if plot: + y = m.likelihood.Y[0, :] + fig, (latent_axes, sense_axes) = plt.subplots(1, 2) + plt.sca(latent_axes) + m.plot_latent() + data_show = GPy.util.visualize.vector_show(y) + lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :], m, data_show, latent_axes=latent_axes) # , sense_axes=sense_axes) + raw_input('Press enter to finish') + plt.close(fig) + return m + +def oil_100(): + data = GPy.util.datasets.oil_100() + m = GPy.models.GPLVM(data['X'], 2) + + # optimize + m.optimize(messages=1, max_iters=2) + + # plot + print(m) + # m.plot_latent(labels=data['Y'].argmax(axis=1)) + return m + + + +def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False): + x = np.linspace(0, 4 * np.pi, N)[:, None] + s1 = np.vectorize(lambda x: 
np.sin(x)) + s2 = np.vectorize(lambda x: np.cos(x)) + s3 = np.vectorize(lambda x:-np.exp(-np.cos(2 * x))) + sS = np.vectorize(lambda x: np.sin(2 * x)) + + s1 = s1(x) + s2 = s2(x) + s3 = s3(x) + sS = sS(x) + + S1 = np.hstack([s1, sS]) + S2 = np.hstack([s2, s3, sS]) + S3 = np.hstack([s3, sS]) + + Y1 = S1.dot(np.random.randn(S1.shape[1], D1)) + Y2 = S2.dot(np.random.randn(S2.shape[1], D2)) + Y3 = S3.dot(np.random.randn(S3.shape[1], D3)) + + Y1 += .3 * np.random.randn(*Y1.shape) + Y2 += .2 * np.random.randn(*Y2.shape) + Y3 += .25 * np.random.randn(*Y3.shape) + + Y1 -= Y1.mean(0) + Y2 -= Y2.mean(0) + Y3 -= Y3.mean(0) + Y1 /= Y1.std(0) + Y2 /= Y2.std(0) + Y3 /= Y3.std(0) + + slist = [sS, s1, s2, s3] + slist_names = ["sS", "s1", "s2", "s3"] + Ylist = [Y1, Y2, Y3] + + if plot_sim: + import pylab + import itertools + fig = pylab.figure("MRD Simulation Data", figsize=(8, 6)) + fig.clf() + ax = fig.add_subplot(2, 1, 1) + labls = slist_names + for S, lab in itertools.izip(slist, labls): + ax.plot(S, label=lab) + ax.legend() + for i, Y in enumerate(Ylist): + ax = fig.add_subplot(2, len(Ylist), len(Ylist) + 1 + i) + ax.imshow(Y, aspect='auto', cmap=cm.gray) # @UndefinedVariable + ax.set_title("Y{}".format(i + 1)) + pylab.draw() + pylab.tight_layout() + + return slist, [S1, S2, S3], Ylist + +def bgplvm_simulation_matlab_compare(): + from GPy.util.datasets import simulation_BGPLVM + sim_data = simulation_BGPLVM() + Y = sim_data['Y'] + S = sim_data['S'] + mu = sim_data['mu'] + num_inducing, [_, Q] = 3, mu.shape + + from GPy.models import mrd + from GPy import kern + reload(mrd); reload(kern) + k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2)) + m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k, +# X=mu, +# X_variance=S, + _debug=False) + m.auto_scale_factor = True + m['noise'] = Y.var() / 100. 
+ m['linear_variance'] = .01 + return m + +def bgplvm_simulation(optimize='scg', + plot=True, + max_iters=2e4, + plot_sim=False): +# from GPy.core.transformations import logexp_clipped + D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10 + slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) + + from GPy.models import mrd + from GPy import kern + reload(mrd); reload(kern) + + Y = Ylist[0] + + k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2)) # + kern.bias(Q) + m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k) + + # m.constrain('variance|noise', logexp_clipped()) + m['noise'] = Y.var() / 100. + + if optimize: + print "Optimizing model:" + m.optimize(optimize, max_iters=max_iters, + messages=True, gtol=.05) + if plot: + m.plot_X_1d("BGPLVM Latent Space 1D") + m.kern.plot_ARD('BGPLVM Simulation ARD Parameters') + return m + +def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw): + D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5 + slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) + + likelihood_list = [Gaussian(x, normalize=True) for x in Ylist] + + from GPy.models import mrd + from GPy import kern + + reload(mrd); reload(kern) + + k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2)) + m = mrd.MRD(likelihood_list, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw) + m.ensure_default_constraints() + + for i, bgplvm in enumerate(m.bgplvms): + m['{}_noise'.format(i)] = bgplvm.likelihood.Y.var() / 500. 
+ + + # DEBUG + # np.seterr("raise") + + if optimize: + print "Optimizing Model:" + m.optimize(messages=1, max_iters=8e3, gtol=.1) + if plot: + m.plot_X_1d("MRD Latent Space 1D") + m.plot_scales("MRD Scales") + return m + +def brendan_faces(): + from GPy import kern + data = GPy.util.datasets.brendan_faces() + Q = 2 + Y = data['Y'][0:-1:10, :] + # Y = data['Y'] + Yn = Y - Y.mean() + Yn /= Yn.std() + + m = GPy.models.GPLVM(Yn, Q) + # m = GPy.models.BayesianGPLVM(Yn, Q, num_inducing=100) + + # optimize + m.constrain('rbf|noise|white', GPy.core.transformations.logexp_clipped()) + + m.optimize('scg', messages=1, max_f_eval=10000) + + ax = m.plot_latent(which_indices=(0, 1)) + y = m.likelihood.Y[0, :] + data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, invert=False, scale=False) + lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + raw_input('Press enter to finish') + + return m +def stick_play(range=None, frame_rate=15): + data = GPy.util.datasets.osu_run1() + # optimize + if range == None: + Y = data['Y'].copy() + else: + Y = data['Y'][range[0]:range[1], :].copy() + y = Y[0, :] + data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) + GPy.util.visualize.data_play(Y, data_show, frame_rate) + return Y + +def stick(kernel=None): + data = GPy.util.datasets.osu_run1() + # optimize + m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel) + m.optimize(messages=1, max_f_eval=10000) + if GPy.util.visualize.visual_available: + plt.clf + ax = m.plot_latent() + y = m.likelihood.Y[0, :] + data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) + lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + raw_input('Press enter to finish') + + return m + +def bcgplvm_linear_stick(kernel=None): + data = GPy.util.datasets.osu_run1() + # optimize + mapping = GPy.mappings.Linear(data['Y'].shape[1], 2) + m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, 
mapping=mapping) + m.optimize(messages=1, max_f_eval=10000) + if GPy.util.visualize.visual_available: + plt.clf + ax = m.plot_latent() + y = m.likelihood.Y[0, :] + data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) + lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + raw_input('Press enter to finish') + + return m + +def bcgplvm_stick(kernel=None): + data = GPy.util.datasets.osu_run1() + # optimize + back_kernel=GPy.kern.rbf(data['Y'].shape[1], lengthscale=5.) + mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel) + m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping) + m.optimize(messages=1, max_f_eval=10000) + if GPy.util.visualize.visual_available: + plt.clf + ax = m.plot_latent() + y = m.likelihood.Y[0, :] + data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) + lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + raw_input('Press enter to finish') + + return m + +def robot_wireless(): + data = GPy.util.datasets.robot_wireless() + # optimize + m = GPy.models.GPLVM(data['Y'], 2) + m.optimize(messages=1, max_f_eval=10000) + m._set_params(m._get_params()) + plt.clf + ax = m.plot_latent() + + return m + +def stick_bgplvm(model=None): + data = GPy.util.datasets.osu_run1() + Q = 6 + kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + GPy.kern.white(Q, np.exp(-2)) + m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel) + # optimize + m.ensure_default_constraints() + m.optimize('scg', messages=1, max_iters=200, xtol=1e-300, ftol=1e-300) + m._set_params(m._get_params()) + plt.clf, (latent_axes, sense_axes) = plt.subplots(1, 2) + plt.sca(latent_axes) + m.plot_latent() + y = m.likelihood.Y[0, :].copy() + data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) + lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, latent_axes=latent_axes, 
sense_axes=sense_axes) + raw_input('Press enter to finish') + + return m + + +def cmu_mocap(subject='35', motion=['01'], in_place=True): + + data = GPy.util.datasets.cmu_mocap(subject, motion) + Y = data['Y'] + if in_place: + # Make figure move in place. + data['Y'][:, 0:3] = 0.0 + m = GPy.models.GPLVM(data['Y'], 2, normalize_Y=True) + + # optimize + m.optimize(messages=1, max_f_eval=10000) + + ax = m.plot_latent() + y = m.likelihood.Y[0, :] + data_show = GPy.util.visualize.skeleton_show(y[None, :], data['skel']) + lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + raw_input('Press enter to finish') + lvm_visualizer.close() + + return m + +# def BGPLVM_oil(): +# data = GPy.util.datasets.oil() +# Y, X = data['Y'], data['X'] +# X -= X.mean(axis=0) +# X /= X.std(axis=0) +# +# Q = 10 +# num_inducing = 30 +# +# kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q) +# m = GPy.models.BayesianGPLVM(X, Q, kernel=kernel, num_inducing=num_inducing) +# # m.scale_factor = 100.0 +# m.constrain_positive('(white|noise|bias|X_variance|rbf_variance|rbf_length)') +# from sklearn import cluster +# km = cluster.KMeans(num_inducing, verbose=10) +# Z = km.fit(m.X).cluster_centers_ +# # Z = GPy.util.misc.kmm_init(m.X, num_inducing) +# m.set('iip', Z) +# m.set('bias', 1e-4) +# # optimize +# +# import pdb; pdb.set_trace() +# m.optimize('tnc', messages=1) +# print m +# m.plot_latent(labels=data['Y'].argmax(axis=1)) +# return m + diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py new file mode 100644 index 00000000..c7849a46 --- /dev/null +++ b/GPy/examples/regression.py @@ -0,0 +1,473 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
+# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +""" +Gaussian Processes regression examples +""" +import pylab as pb +import numpy as np +import GPy + +def coregionalisation_toy2(max_iters=100): + """ + A simple demonstration of coregionalisation on two sinusoidal functions. + """ + X1 = np.random.rand(50, 1) * 8 + X2 = np.random.rand(30, 1) * 5 + index = np.vstack((np.zeros_like(X1), np.ones_like(X2))) + X = np.hstack((np.vstack((X1, X2)), index)) + Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05 + Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2. + Y = np.vstack((Y1, Y2)) + + k1 = GPy.kern.rbf(1) + GPy.kern.bias(1) + k2 = GPy.kern.coregionalise(2, 1) + k = k1**k2 + m = GPy.models.GPRegression(X, Y, kernel=k) + m.constrain_fixed('.*rbf_var', 1.) + # m.constrain_positive('.*kappa') + m.optimize('sim', messages=1, max_iters=max_iters) + + pb.figure() + Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1)))) + Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1)))) + mean, var, low, up = m.predict(Xtest1) + GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up) + mean, var, low, up = m.predict(Xtest2) + GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up) + pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2) + pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2) + return m + +def coregionalisation_toy(max_iters=100): + """ + A simple demonstration of coregionalisation on two sinusoidal functions. + """ + X1 = np.random.rand(50, 1) * 8 + X2 = np.random.rand(30, 1) * 5 + index = np.vstack((np.zeros_like(X1), np.ones_like(X2))) + X = np.hstack((np.vstack((X1, X2)), index)) + Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05 + Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + Y = np.vstack((Y1, Y2)) + + k1 = GPy.kern.rbf(1) + k2 = GPy.kern.coregionalise(2, 2) + k = k1**k2 #k1.prod(k2, tensor=True) + m = GPy.models.GPRegression(X, Y, kernel=k) + m.constrain_fixed('.*rbf_var', 1.) 
+ # m.constrain_positive('kappa') + m.optimize(max_iters=max_iters) + + pb.figure() + Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1)))) + Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1)))) + mean, var, low, up = m.predict(Xtest1) + GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up) + mean, var, low, up = m.predict(Xtest2) + GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up) + pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2) + pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2) + return m + + +def coregionalisation_sparse(max_iters=100): + """ + A simple demonstration of coregionalisation on two sinusoidal functions using sparse approximations. + """ + X1 = np.random.rand(500, 1) * 8 + X2 = np.random.rand(300, 1) * 5 + index = np.vstack((np.zeros_like(X1), np.ones_like(X2))) + X = np.hstack((np.vstack((X1, X2)), index)) + Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05 + Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + Y = np.vstack((Y1, Y2)) + + num_inducing = 40 + Z = np.hstack((np.random.rand(num_inducing, 1) * 8, np.random.randint(0, 2, num_inducing)[:, None])) + + k1 = GPy.kern.rbf(1) + k2 = GPy.kern.coregionalise(2, 2) + k = k1**k2 #.prod(k2, tensor=True) # + GPy.kern.white(2,0.001) + + m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z) + m.constrain_fixed('.*rbf_var', 1.) 
+ m.constrain_fixed('iip') + m.constrain_bounded('noise_variance', 1e-3, 1e-1) +# m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs') + m.optimize(max_iters=max_iters) + + # plotting: + pb.figure() + Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1)))) + Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1)))) + mean, var, low, up = m.predict(Xtest1) + GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up) + mean, var, low, up = m.predict(Xtest2) + GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up) + pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2) + pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2) + y = pb.ylim()[0] + pb.plot(Z[:, 0][Z[:, 1] == 0], np.zeros(np.sum(Z[:, 1] == 0)) + y, 'r|', mew=2) + pb.plot(Z[:, 0][Z[:, 1] == 1], np.zeros(np.sum(Z[:, 1] == 1)) + y, 'g|', mew=2) + return m + +def epomeo_gpx(max_iters=100): + """Perform Gaussian process regression on the latitude and longitude data from the Mount Epomeo runs. Requires gpxpy to be installed on your system to load in the data.""" + data = GPy.util.datasets.epomeo_gpx() + num_data_list = [] + for Xpart in data['X']: + num_data_list.append(Xpart.shape[0]) + + num_data_array = np.array(num_data_list) + num_data = num_data_array.sum() + Y = np.zeros((num_data, 2)) + t = np.zeros((num_data, 2)) + start = 0 + for Xpart, index in zip(data['X'], range(len(data['X']))): + end = start+Xpart.shape[0] + t[start:end, :] = np.hstack((Xpart[:, 0:1], + index*np.ones((Xpart.shape[0], 1)))) + Y[start:end, :] = Xpart[:, 1:3] + + num_inducing = 200 + Z = np.hstack((np.linspace(t[:,0].min(), t[:, 0].max(), num_inducing)[:, None], + np.random.randint(0, 4, num_inducing)[:, None])) + + k1 = GPy.kern.rbf(1) + k2 = GPy.kern.coregionalise(output_dim=5, rank=5) + k = k1**k2 + + m = GPy.models.SparseGPRegression(t, Y, kernel=k, Z=Z, normalize_Y=True) + m.constrain_fixed('.*rbf_var', 1.) 
+ m.constrain_fixed('iip') + m.constrain_bounded('noise_variance', 1e-3, 1e-1) +# m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs') + m.optimize(max_iters=max_iters,messages=True) + + return m + + +def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=10000, max_iters=300): + """Show an example of a multimodal error surface for Gaussian process regression. Gene 939 has bimodal behaviour where the noisy mode is higher.""" + + # Contour over a range of length scales and signal/noise ratios. + length_scales = np.linspace(0.1, 60., resolution) + log_SNRs = np.linspace(-3., 4., resolution) + + data = GPy.util.datasets.della_gatta_TRP63_gene_expression(gene_number) + # data['Y'] = data['Y'][0::2, :] + # data['X'] = data['X'][0::2, :] + + data['Y'] = data['Y'] - np.mean(data['Y']) + + lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.rbf) + pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet) + ax = pb.gca() + pb.xlabel('length scale') + pb.ylabel('log_10 SNR') + + xlim = ax.get_xlim() + ylim = ax.get_ylim() + + # Now run a few optimizations + models = [] + optim_point_x = np.empty(2) + optim_point_y = np.empty(2) + np.random.seed(seed=seed) + for i in range(0, model_restarts): + # kern = GPy.kern.rbf(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.)) + kern = GPy.kern.rbf(1, variance=np.random.uniform(1e-3, 1), lengthscale=np.random.uniform(5, 50)) + + m = GPy.models.GPRegression(data['X'], data['Y'], kernel=kern) + m['noise_variance'] = np.random.uniform(1e-3, 1) + optim_point_x[0] = m['rbf_lengthscale'] + optim_point_y[0] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']); + + # optimize + m.optimize('scg', xtol=1e-6, ftol=1e-6, max_iters=max_iters) + + optim_point_x[1] = m['rbf_lengthscale'] + optim_point_y[1] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']); + + pb.arrow(optim_point_x[0], optim_point_y[0], 
optim_point_x[1] - optim_point_x[0], optim_point_y[1] - optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k') + models.append(m) + + ax.set_xlim(xlim) + ax.set_ylim(ylim) + return m # (models, lls) + +def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf): + """Evaluate the GP objective function for a given data set for a range of signal to noise ratios and a range of lengthscales. + + :data_set: A data set from the utils.datasets director. + :length_scales: a list of length scales to explore for the contour plot. + :log_SNRs: a list of base 10 logarithm signal to noise ratios to explore for the contour plot. + :kernel: a kernel to use for the 'signal' portion of the data.""" + + lls = [] + total_var = np.var(data['Y']) + kernel = kernel_call(1, variance=1., lengthscale=1.) + model = GPy.models.GPRegression(data['X'], data['Y'], kernel=kernel) + for log_SNR in log_SNRs: + SNR = 10.**log_SNR + noise_var = total_var / (1. + SNR) + signal_var = total_var - noise_var + model.kern['.*variance'] = signal_var + model['noise_variance'] = noise_var + length_scale_lls = [] + + for length_scale in length_scales: + model['.*lengthscale'] = length_scale + length_scale_lls.append(model.log_likelihood()) + + lls.append(length_scale_lls) + + return np.array(lls) + + +def olympic_100m_men(max_iters=100, kernel=None): + """Run a standard Gaussian process regression on the Rogers and Girolami olympics data.""" + data = GPy.util.datasets.olympic_100m_men() + + # create simple GP Model + m = GPy.models.GPRegression(data['X'], data['Y'], kernel) + + # set the lengthscale to be something sensible (defaults to 1) + if kernel==None: + m['rbf_lengthscale'] = 10 + + # optimize + m.optimize(max_iters=max_iters) + + # plot + m.plot(plot_limits=(1850, 2050)) + print(m) + return m + +def olympic_marathon_men(max_iters=100, kernel=None): + """Run a standard Gaussian process regression on the Olympic marathon data.""" + data = 
GPy.util.datasets.olympic_marathon_men() + + # create simple GP Model + m = GPy.models.GPRegression(data['X'], data['Y'], kernel) + + # set the lengthscale to be something sensible (defaults to 1) + if kernel==None: + m['rbf_lengthscale'] = 10 + + # optimize + m.optimize(max_iters=max_iters) + + # plot + m.plot(plot_limits=(1850, 2050)) + print(m) + return m + +def toy_rbf_1d(optimizer='tnc', max_nb_eval_optim=100): + """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance.""" + data = GPy.util.datasets.toy_rbf_1d() + + # create simple GP Model + m = GPy.models.GPRegression(data['X'], data['Y']) + + # optimize + m.optimize(optimizer, max_f_eval=max_nb_eval_optim) + # plot + m.plot() + print(m) + return m + +def toy_rbf_1d_50(max_iters=100): + """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance.""" + data = GPy.util.datasets.toy_rbf_1d_50() + + # create simple GP Model + m = GPy.models.GPRegression(data['X'], data['Y']) + + # optimize + m.optimize(max_iters=max_iters) + + # plot + m.plot() + print(m) + return m + +def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4): + # Create an artificial dataset where the values in the targets (Y) + # only depend in dimensions 1 and 3 of the inputs (X). 
Run ARD to + # see if this dependency can be recovered + X1 = np.sin(np.sort(np.random.rand(num_samples, 1) * 10, 0)) + X2 = np.cos(np.sort(np.random.rand(num_samples, 1) * 10, 0)) + X3 = np.exp(np.sort(np.random.rand(num_samples, 1), 0)) + X4 = np.log(np.sort(np.random.rand(num_samples, 1), 0)) + X = np.hstack((X1, X2, X3, X4)) + + Y1 = np.asarray(2 * X[:, 0] + 3).reshape(-1, 1) + Y2 = np.asarray(4 * (X[:, 2] - 1.5 * X[:, 0])).reshape(-1, 1) + Y = np.hstack((Y1, Y2)) + + Y = np.dot(Y, np.random.rand(2, D)); + Y = Y + 0.2 * np.random.randn(Y.shape[0], Y.shape[1]) + Y -= Y.mean() + Y /= Y.std() + + if kernel_type == 'linear': + kernel = GPy.kern.linear(X.shape[1], ARD=1) + elif kernel_type == 'rbf_inv': + kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1) + else: + kernel = GPy.kern.rbf(X.shape[1], ARD=1) + kernel += GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1]) + m = GPy.models.GPRegression(X, Y, kernel) + # len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25 + # m.set_prior('.*lengthscale',len_prior) + + m.optimize(optimizer='scg', max_iters=max_iters, messages=1) + + m.kern.plot_ARD() + print(m) + return m + +def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4): + # Create an artificial dataset where the values in the targets (Y) + # only depend in dimensions 1 and 3 of the inputs (X). 
Run ARD to + # see if this dependency can be recovered + X1 = np.sin(np.sort(np.random.rand(num_samples, 1) * 10, 0)) + X2 = np.cos(np.sort(np.random.rand(num_samples, 1) * 10, 0)) + X3 = np.exp(np.sort(np.random.rand(num_samples, 1), 0)) + X4 = np.log(np.sort(np.random.rand(num_samples, 1), 0)) + X = np.hstack((X1, X2, X3, X4)) + + Y1 = np.asarray(2 * X[:, 0] + 3)[:, None] + Y2 = np.asarray(4 * (X[:, 2] - 1.5 * X[:, 0]))[:, None] + Y = np.hstack((Y1, Y2)) + + Y = np.dot(Y, np.random.rand(2, D)); + Y = Y + 0.2 * np.random.randn(Y.shape[0], Y.shape[1]) + Y -= Y.mean() + Y /= Y.std() + + if kernel_type == 'linear': + kernel = GPy.kern.linear(X.shape[1], ARD=1) + elif kernel_type == 'rbf_inv': + kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1) + else: + kernel = GPy.kern.rbf(X.shape[1], ARD=1) + kernel += GPy.kern.bias(X.shape[1]) + X_variance = np.ones(X.shape) * 0.5 + m = GPy.models.SparseGPRegression(X, Y, kernel, X_variance=X_variance) + # len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25 + # m.set_prior('.*lengthscale',len_prior) + + m.optimize(optimizer='scg', max_iters=max_iters, messages=1) + + m.kern.plot_ARD() + print(m) + return m + +def robot_wireless(max_iters=100, kernel=None): + """Predict the location of a robot given wirelss signal strength readings.""" + data = GPy.util.datasets.robot_wireless() + + # create simple GP Model + m = GPy.models.GPRegression(data['Y'], data['X'], kernel=kernel) + + # optimize + m.optimize(messages=True, max_iters=max_iters) + Xpredict = m.predict(data['Ytest'])[0] + pb.plot(data['Xtest'][:, 0], data['Xtest'][:, 1], 'r-') + pb.plot(Xpredict[:, 0], Xpredict[:, 1], 'b-') + pb.axis('equal') + pb.title('WiFi Localization with Gaussian Processes') + pb.legend(('True Location', 'Predicted Location')) + + sse = ((data['Xtest'] - Xpredict)**2).sum() + print(m) + print('Sum of squares error on test data: ' + str(sse)) + return m + +def silhouette(max_iters=100): + """Predict the pose of a figure given a silhouette. 
This is a task from Agarwal and Triggs 2004 ICML paper.""" + data = GPy.util.datasets.silhouette() + + # create simple GP Model + m = GPy.models.GPRegression(data['X'], data['Y']) + + # optimize + m.optimize(messages=True, max_iters=max_iters) + + print(m) + return m + + + +def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100): + """Run a 1D example of a sparse GP regression.""" + # sample inputs and outputs + X = np.random.uniform(-3., 3., (num_samples, 1)) + Y = np.sin(X) + np.random.randn(num_samples, 1) * 0.05 + # construct kernel + rbf = GPy.kern.rbf(1) + # create simple GP Model + m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing) + + + m.checkgrad(verbose=1) + m.optimize('tnc', messages=1, max_iters=max_iters) + m.plot() + return m + +def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100): + """Run a 2D example of a sparse GP regression.""" + X = np.random.uniform(-3., 3., (num_samples, 2)) + Y = np.sin(X[:, 0:1]) * np.sin(X[:, 1:2]) + np.random.randn(num_samples, 1) * 0.05 + + # construct kernel + rbf = GPy.kern.rbf(2) + + # create simple GP Model + m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing) + + # contrain all parameters to be positive (but not inducing inputs) + m['.*len'] = 2. 
+ + m.checkgrad() + + # optimize and plot + m.optimize('tnc', messages=1, max_iters=max_iters) + m.plot() + print(m) + return m + +def uncertain_inputs_sparse_regression(max_iters=100): + """Run a 1D example of a sparse GP regression with uncertain inputs.""" + fig, axes = pb.subplots(1, 2, figsize=(12, 5)) + + # sample inputs and outputs + S = np.ones((20, 1)) + X = np.random.uniform(-3., 3., (20, 1)) + Y = np.sin(X) + np.random.randn(20, 1) * 0.05 + # likelihood = GPy.likelihoods.Gaussian(Y) + Z = np.random.uniform(-3., 3., (7, 1)) + + k = GPy.kern.rbf(1) + + # create simple GP Model - no input uncertainty on this one + m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z) + m.optimize('scg', messages=1, max_iters=max_iters) + m.plot(ax=axes[0]) + axes[0].set_title('no input uncertainty') + + + # the same Model with uncertainty + m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z, X_variance=S) + m.optimize('scg', messages=1, max_iters=max_iters) + m.plot(ax=axes[1]) + axes[1].set_title('with input uncertainty') + print(m) + + fig.canvas.draw() + + return m diff --git a/GPy/examples/stochastic.py b/GPy/examples/stochastic.py new file mode 100644 index 00000000..21011901 --- /dev/null +++ b/GPy/examples/stochastic.py @@ -0,0 +1,41 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import pylab as pb +import numpy as np +import GPy + +def toy_1d(): + N = 2000 + M = 20 + + #create data + X = np.linspace(0,32,N)[:,None] + Z = np.linspace(0,32,M)[:,None] + Y = np.sin(X) + np.cos(0.3*X) + np.random.randn(*X.shape)/np.sqrt(50.) 
+ + m = GPy.models.SVIGPRegression(X,Y, batchsize=10, Z=Z) + m.constrain_bounded('noise_variance',1e-3,1e-1) + m.constrain_bounded('white_variance',1e-3,1e-1) + + m.param_steplength = 1e-4 + + fig = pb.figure() + ax = fig.add_subplot(111) + def cb(): + ax.cla() + m.plot(ax=ax,Z_height=-3) + ax.set_ylim(-3,3) + fig.canvas.draw() + + m.optimize(500, callback=cb, callback_interval=1) + + m.plot_traces() + return m + + + + + + + diff --git a/GPy/examples/tutorials.py b/GPy/examples/tutorials.py new file mode 100644 index 00000000..69fc2aaf --- /dev/null +++ b/GPy/examples/tutorials.py @@ -0,0 +1,146 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +""" +Code of Tutorials +""" + +import pylab as pb +pb.ion() +import numpy as np +import GPy + +def tuto_GP_regression(): + """The detailed explanations of the commands used in this file can be found in the tutorial section""" + + X = np.random.uniform(-3.,3.,(20,1)) + Y = np.sin(X) + np.random.randn(20,1)*0.05 + + kernel = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.) + + m = GPy.models.GPRegression(X, Y, kernel) + + print m + m.plot() + + m.constrain_positive('') + + m.unconstrain('') # may be used to remove the previous constrains + m.constrain_positive('.*rbf_variance') + m.constrain_bounded('.*lengthscale',1.,10. 
) + m.constrain_fixed('.*noise',0.0025) + + m.optimize() + + m.optimize_restarts(num_restarts = 10) + + ####################################################### + ####################################################### + # sample inputs and outputs + X = np.random.uniform(-3.,3.,(50,2)) + Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(50,1)*0.05 + + # define kernel + ker = GPy.kern.Matern52(2,ARD=True) + GPy.kern.white(2) + + # create simple GP model + m = GPy.models.GPRegression(X, Y, ker) + + # contrain all parameters to be positive + m.constrain_positive('') + + # optimize and plot + m.optimize('tnc', max_f_eval = 1000) + m.plot() + print(m) + return(m) + +def tuto_kernel_overview(): + """The detailed explanations of the commands used in this file can be found in the tutorial section""" + ker1 = GPy.kern.rbf(1) # Equivalent to ker1 = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.) + ker2 = GPy.kern.rbf(input_dim=1, variance = .75, lengthscale=2.) + ker3 = GPy.kern.rbf(1, .5, .5) + + print ker2 + + ker1.plot() + ker2.plot() + ker3.plot() + + k1 = GPy.kern.rbf(1,1.,2.) 
+ k2 = GPy.kern.Matern32(1, 0.5, 0.2) + + # Product of kernels + k_prod = k1.prod(k2) # By default, tensor=False + k_prodtens = k1.prod(k2,tensor=True) + + # Sum of kernels + k_add = k1.add(k2) # By default, tensor=False + k_addtens = k1.add(k2,tensor=True) + + k1 = GPy.kern.rbf(1,1.,2) + k2 = GPy.kern.periodic_Matern52(1,variance=1e3, lengthscale=1, period = 1.5, lower=-5., upper = 5) + + k = k1 * k2 # equivalent to k = k1.prod(k2) + print k + + # Simulate sample paths + X = np.linspace(-5,5,501)[:,None] + Y = np.random.multivariate_normal(np.zeros(501),k.K(X),1) + + k1 = GPy.kern.rbf(1) + k2 = GPy.kern.Matern32(1) + k3 = GPy.kern.white(1) + + k = k1 + k2 + k3 + print k + + k.constrain_positive('.*var') + k.constrain_fixed(np.array([1]),1.75) + k.tie_params('.*len') + k.unconstrain('white') + k.constrain_bounded('white',lower=1e-5,upper=.5) + print k + + k_cst = GPy.kern.bias(1,variance=1.) + k_mat = GPy.kern.Matern52(1,variance=1., lengthscale=3) + Kanova = (k_cst + k_mat).prod(k_cst + k_mat,tensor=True) + print Kanova + + # sample inputs and outputs + X = np.random.uniform(-3.,3.,(40,2)) + Y = 0.5*X[:,:1] + 0.5*X[:,1:] + 2*np.sin(X[:,:1]) * np.sin(X[:,1:]) + + # Create GP regression model + m = GPy.models.GPRegression(X, Y, Kanova) + fig = pb.figure(figsize=(5,5)) + ax = fig.add_subplot(111) + m.plot(ax=ax) + + pb.figure(figsize=(20,3)) + pb.subplots_adjust(wspace=0.5) + axs = pb.subplot(1,5,1) + m.plot(ax=axs) + pb.subplot(1,5,2) + pb.ylabel("= ",rotation='horizontal',fontsize='30') + axs = pb.subplot(1,5,3) + m.plot(ax=axs, which_parts=[False,True,False,False]) + pb.ylabel("cst +",rotation='horizontal',fontsize='30') + axs = pb.subplot(1,5,4) + m.plot(ax=axs, which_parts=[False,False,True,False]) + pb.ylabel("+ ",rotation='horizontal',fontsize='30') + axs = pb.subplot(1,5,5) + pb.ylabel("+ ",rotation='horizontal',fontsize='30') + m.plot(ax=axs, which_parts=[False,False,False,True]) + + return(m) + + +def model_interaction(): + X = np.random.randn(20,1) + Y = 
np.sin(X) + np.random.randn(*X.shape)*0.01 + 5. + k = GPy.kern.rbf(1) + GPy.kern.bias(1) + m = GPy.models.GPRegression(X, Y, kernel=k) + return m + diff --git a/GPy/inference/__init__.py b/GPy/inference/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/GPy/inference/conjugate_gradient_descent.py b/GPy/inference/conjugate_gradient_descent.py new file mode 100644 index 00000000..0f6603e5 --- /dev/null +++ b/GPy/inference/conjugate_gradient_descent.py @@ -0,0 +1,290 @@ +''' +Created on 24 Apr 2013 + +@author: maxz +''' +from GPy.inference.gradient_descent_update_rules import FletcherReeves, \ + PolakRibiere +from Queue import Empty +from multiprocessing import Value +from multiprocessing.queues import Queue +from multiprocessing.synchronize import Event +from scipy.optimize.linesearch import line_search_wolfe1, line_search_wolfe2 +from threading import Thread +import numpy +import sys +import time + +RUNNING = "running" +CONVERGED = "converged" +MAXITER = "maximum number of iterations reached" +MAX_F_EVAL = "maximum number of function calls reached" +LINE_SEARCH = "line search failed" +KBINTERRUPT = "interrupted" + +class _Async_Optimization(Thread): + + def __init__(self, f, df, x0, update_rule, runsignal, SENTINEL, + report_every=10, messages=0, maxiter=5e3, max_f_eval=15e3, + gtol=1e-6, outqueue=None, *args, **kw): + """ + Helper Process class for async optimization + + f_call and df_call are Multiprocessing Values, for synchronized assignment + """ + self.f_call = Value('i', 0) + self.df_call = Value('i', 0) + self.f = self.f_wrapper(f, self.f_call) + self.df = self.f_wrapper(df, self.df_call) + self.x0 = x0 + self.update_rule = update_rule + self.report_every = report_every + self.messages = messages + self.maxiter = maxiter + self.max_f_eval = max_f_eval + self.gtol = gtol + self.SENTINEL = SENTINEL + self.runsignal = runsignal +# self.parent = parent +# self.result = None + self.outq = outqueue + super(_Async_Optimization, 
self).__init__(target=self.run, + name="CG Optimization", + *args, **kw) + +# def __enter__(self): +# return self +# +# def __exit__(self, type, value, traceback): +# return isinstance(value, TypeError) + + def f_wrapper(self, f, counter): + def f_w(*a, **kw): + counter.value += 1 + return f(*a, **kw) + return f_w + + def callback(self, *a): + if self.outq is not None: + self.outq.put(a) +# self.parent and self.parent.callback(*a, **kw) + pass + # print "callback done" + + def callback_return(self, *a): + self.callback(*a) + if self.outq is not None: + self.outq.put(self.SENTINEL) + if self.messages: + print "" + self.runsignal.clear() + + def run(self, *args, **kwargs): + raise NotImplementedError("Overwrite this with optimization (for async use)") + pass + +class _CGDAsync(_Async_Optimization): + + def reset(self, xi, *a, **kw): + gi = -self.df(xi, *a, **kw) + si = gi + ur = self.update_rule(gi) + return gi, ur, si + + def run(self, *a, **kw): + status = RUNNING + + fi = self.f(self.x0) + fi_old = fi + 5000 + + gi, ur, si = self.reset(self.x0, *a, **kw) + xi = self.x0 + xi_old = numpy.nan + it = 0 + + while it < self.maxiter: + if not self.runsignal.is_set(): + break + + if self.f_call.value > self.max_f_eval: + status = MAX_F_EVAL + + gi = -self.df(xi, *a, **kw) + if numpy.dot(gi.T, gi) <= self.gtol: + status = CONVERGED + break + if numpy.isnan(numpy.dot(gi.T, gi)): + if numpy.any(numpy.isnan(xi_old)): + status = CONVERGED + break + self.reset(xi_old) + + gammai = ur(gi) + if gammai < 1e-6 or it % xi.shape[0] == 0: + gi, ur, si = self.reset(xi, *a, **kw) + si = gi + gammai * si + alphai, _, _, fi2, fi_old2, gfi = line_search_wolfe1(self.f, + self.df, + xi, + si, gi, + fi, fi_old) + if alphai is None: + alphai, _, _, fi2, fi_old2, gfi = \ + line_search_wolfe2(self.f, self.df, + xi, si, gi, + fi, fi_old) + if alphai is None: + # This line search also failed to find a better solution. 
+ status = LINE_SEARCH + break + if fi2 < fi: + fi, fi_old = fi2, fi_old2 + if gfi is not None: + gi = gfi + + if numpy.isnan(fi) or fi_old < fi: + gi, ur, si = self.reset(xi, *a, **kw) + + else: + xi += numpy.dot(alphai, si) + if self.messages: + sys.stdout.write("\r") + sys.stdout.flush() + sys.stdout.write("iteration: {0:> 6g} f:{1:> 12e} |g|:{2:> 12e}".format(it, fi, numpy.dot(gi.T, gi))) + + if it % self.report_every == 0: + self.callback(xi, fi, gi, it, self.f_call.value, self.df_call.value, status) + it += 1 + else: + status = MAXITER + self.callback_return(xi, fi, gi, it, self.f_call.value, self.df_call.value, status) + self.result = [xi, fi, gi, it, self.f_call.value, self.df_call.value, status] + +class Async_Optimize(object): + callback = lambda *x: None + runsignal = Event() + SENTINEL = "SENTINEL" + + def async_callback_collect(self, q): + while self.runsignal.is_set(): + try: + for ret in iter(lambda: q.get(timeout=1), self.SENTINEL): + self.callback(*ret) + self.runsignal.clear() + except Empty: + pass + + def opt_async(self, f, df, x0, callback, update_rule=PolakRibiere, + messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6, + report_every=10, *args, **kwargs): + self.runsignal.set() + c = None + outqueue = None + if callback: + outqueue = Queue() + self.callback = callback + c = Thread(target=self.async_callback_collect, args=(outqueue,)) + c.start() + p = _CGDAsync(f, df, x0, update_rule, self.runsignal, self.SENTINEL, + report_every=report_every, messages=messages, maxiter=maxiter, + max_f_eval=max_f_eval, gtol=gtol, outqueue=outqueue, *args, **kwargs) + p.start() + return p, c + + def opt(self, f, df, x0, callback=None, update_rule=FletcherReeves, + messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6, + report_every=10, *args, **kwargs): + p, c = self.opt_async(f, df, x0, callback, update_rule, messages, + maxiter, max_f_eval, gtol, + report_every, *args, **kwargs) + while self.runsignal.is_set(): + try: + p.join(1) + if c: c.join(1) + except 
KeyboardInterrupt: + # print "^C" + self.runsignal.clear() + p.join() + if c: c.join() + if c and c.is_alive(): +# self.runsignal.set() +# while self.runsignal.is_set(): +# try: +# c.join(.1) +# except KeyboardInterrupt: +# # print "^C" +# self.runsignal.clear() +# c.join() + print "WARNING: callback still running, optimisation done!" + return p.result + +class CGD(Async_Optimize): + ''' + Conjugate gradient descent algorithm to minimize + function f with gradients df, starting at x0 + with update rule update_rule + + if df returns tuple (grad, natgrad) it will optimize according + to natural gradient rules + ''' + opt_name = "Conjugate Gradient Descent" + + def opt_async(self, *a, **kw): + """ + opt_async(self, f, df, x0, callback, update_rule=FletcherReeves, + messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6, + report_every=10, *args, **kwargs) + + callback gets called every `report_every` iterations + + callback(xi, fi, gi, iteration, function_calls, gradient_calls, status_message) + + if df returns tuple (grad, natgrad) it will optimize according + to natural gradient rules + + f, and df will be called with + + f(xi, *args, **kwargs) + df(xi, *args, **kwargs) + + **returns** + ----------- + + Started `Process` object, optimizing asynchronously + + **calls** + --------- + + callback(x_opt, f_opt, g_opt, iteration, function_calls, gradient_calls, status_message) + + at end of optimization! 
+ """ + return super(CGD, self).opt_async(*a, **kw) + + def opt(self, *a, **kw): + """ + opt(self, f, df, x0, callback=None, update_rule=FletcherReeves, + messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6, + report_every=10, *args, **kwargs) + + Minimize f, calling callback every `report_every` iterations with following syntax: + + callback(xi, fi, gi, iteration, function_calls, gradient_calls, status_message) + + if df returns tuple (grad, natgrad) it will optimize according + to natural gradient rules + + f, and df will be called with + + f(xi, *args, **kwargs) + df(xi, *args, **kwargs) + + **returns** + --------- + + x_opt, f_opt, g_opt, iteration, function_calls, gradient_calls, status_message + + at end of optimization + """ + return super(CGD, self).opt(*a, **kw) + diff --git a/GPy/inference/gradient_descent_update_rules.py b/GPy/inference/gradient_descent_update_rules.py new file mode 100644 index 00000000..1c14ed63 --- /dev/null +++ b/GPy/inference/gradient_descent_update_rules.py @@ -0,0 +1,53 @@ +''' +Created on 24 Apr 2013 + +@author: maxz +''' +import numpy + +class GDUpdateRule(): + _gradnat = None + _gradnatold = None + def __init__(self, initgrad, initgradnat=None): + self.grad = initgrad + if initgradnat: + self.gradnat = initgradnat + else: + self.gradnat = initgrad + # self.grad, self.gradnat + def _gamma(self): + raise NotImplemented("""Implement gamma update rule here, + you can use self.grad and self.gradold for parameters, as well as + self.gradnat and self.gradnatold for natural gradients.""") + def __call__(self, grad, gradnat=None, si=None, *args, **kw): + """ + Return gamma for given gradients and optional natural gradients + """ + if not gradnat: + gradnat = grad + self.gradold = self.grad + self.gradnatold = self.gradnat + self.grad = grad + self.gradnat = gradnat + self.si = si + return self._gamma(*args, **kw) + +class FletcherReeves(GDUpdateRule): + ''' + Fletcher Reeves update rule for gamma + ''' + def _gamma(self, *a, **kw): + 
tmp = numpy.dot(self.grad.T, self.gradnat)
+        if tmp:
+            return tmp / numpy.dot(self.gradold.T, self.gradnatold)
+        return tmp
+
+class PolakRibiere(GDUpdateRule):
+    '''
+    Polak-Ribiere update rule for gamma
+    '''
+    def _gamma(self, *a, **kw):
+        tmp = numpy.dot((self.grad - self.gradold).T, self.gradnat)
+        if tmp:
+            return tmp / numpy.dot(self.gradold.T, self.gradnatold)
+        return tmp
diff --git a/GPy/inference/optimization.py b/GPy/inference/optimization.py
new file mode 100644
index 00000000..0ef487af
--- /dev/null
+++ b/GPy/inference/optimization.py
@@ -0,0 +1,240 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import pylab as pb
+import datetime as dt
+from scipy import optimize
+from warnings import warn
+
+try:
+    import rasmussens_minimize as rasm
+    rasm_available = True
+except ImportError:
+    rasm_available = False
+from scg import SCG
+
+class Optimizer():
+    """
+    Superclass for all the optimizers.
+
+    :param x_init: initial set of parameters
+    :param f_fp: function that returns the function AND the gradients at the same time
+    :param f: function to optimize
+    :param fp: gradients
+    :param messages: print messages from the optimizer?
+    :type messages: (True | False)
+    :param max_f_eval: maximum number of function evaluations
+
+    :rtype: optimizer object. 
+ + """ + def __init__(self, x_init, messages=False, model=None, max_f_eval=1e4, max_iters=1e3, + ftol=None, gtol=None, xtol=None): + self.opt_name = None + self.x_init = x_init + self.messages = messages + self.f_opt = None + self.x_opt = None + self.funct_eval = None + self.status = None + self.max_f_eval = int(max_f_eval) + self.max_iters = int(max_iters) + self.trace = None + self.time = "Not available" + self.xtol = xtol + self.gtol = gtol + self.ftol = ftol + self.model = model + + def run(self, **kwargs): + start = dt.datetime.now() + self.opt(**kwargs) + end = dt.datetime.now() + self.time = str(end - start) + + def opt(self, f_fp=None, f=None, fp=None): + raise NotImplementedError, "this needs to be implemented to use the optimizer class" + + def plot(self): + if self.trace == None: + print "No trace present so I can't plot it. Please check that the optimizer actually supplies a trace." + else: + pb.figure() + pb.plot(self.trace) + pb.xlabel('Iteration') + pb.ylabel('f(x)') + + def __str__(self): + diagnostics = "Optimizer: \t\t\t\t %s\n" % self.opt_name + diagnostics += "f(x_opt): \t\t\t\t %.3f\n" % self.f_opt + diagnostics += "Number of function evaluations: \t %d\n" % self.funct_eval + diagnostics += "Optimization status: \t\t\t %s\n" % self.status + diagnostics += "Time elapsed: \t\t\t\t %s\n" % self.time + return diagnostics + +class opt_tnc(Optimizer): + def __init__(self, *args, **kwargs): + Optimizer.__init__(self, *args, **kwargs) + self.opt_name = "TNC (Scipy implementation)" + + def opt(self, f_fp=None, f=None, fp=None): + """ + Run the TNC optimizer + + """ + tnc_rcstrings = ['Local minimum', 'Converged', 'XConverged', 'Maximum number of f evaluations reached', + 'Line search failed', 'Function is constant'] + + assert f_fp != None, "TNC requires f_fp" + + opt_dict = {} + if self.xtol is not None: + opt_dict['xtol'] = self.xtol + if self.ftol is not None: + opt_dict['ftol'] = self.ftol + if self.gtol is not None: + opt_dict['pgtol'] = self.gtol 
+ + opt_result = optimize.fmin_tnc(f_fp, self.x_init, messages=self.messages, + maxfun=self.max_f_eval, **opt_dict) + self.x_opt = opt_result[0] + self.f_opt = f_fp(self.x_opt)[0] + self.funct_eval = opt_result[1] + self.status = tnc_rcstrings[opt_result[2]] + +class opt_lbfgsb(Optimizer): + def __init__(self, *args, **kwargs): + Optimizer.__init__(self, *args, **kwargs) + self.opt_name = "L-BFGS-B (Scipy implementation)" + + def opt(self, f_fp=None, f=None, fp=None): + """ + Run the optimizer + + """ + rcstrings = ['Converged', 'Maximum number of f evaluations reached', 'Error'] + + assert f_fp != None, "BFGS requires f_fp" + + if self.messages: + iprint = 1 + else: + iprint = -1 + + opt_dict = {} + if self.xtol is not None: + print "WARNING: l-bfgs-b doesn't have an xtol arg, so I'm going to ignore it" + if self.ftol is not None: + print "WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it" + if self.gtol is not None: + opt_dict['pgtol'] = self.gtol + + opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint=iprint, + maxfun=self.max_f_eval, **opt_dict) + self.x_opt = opt_result[0] + self.f_opt = f_fp(self.x_opt)[0] + self.funct_eval = opt_result[2]['funcalls'] + self.status = rcstrings[opt_result[2]['warnflag']] + +class opt_simplex(Optimizer): + def __init__(self, *args, **kwargs): + Optimizer.__init__(self, *args, **kwargs) + self.opt_name = "Nelder-Mead simplex routine (via Scipy)" + + def opt(self, f_fp=None, f=None, fp=None): + """ + The simplex optimizer does not require gradients. 
+ """ + + statuses = ['Converged', 'Maximum number of function evaluations made', 'Maximum number of iterations reached'] + + opt_dict = {} + if self.xtol is not None: + opt_dict['xtol'] = self.xtol + if self.ftol is not None: + opt_dict['ftol'] = self.ftol + if self.gtol is not None: + print "WARNING: simplex doesn't have an gtol arg, so I'm going to ignore it" + + opt_result = optimize.fmin(f, self.x_init, (), disp=self.messages, + maxfun=self.max_f_eval, full_output=True, **opt_dict) + + self.x_opt = opt_result[0] + self.f_opt = opt_result[1] + self.funct_eval = opt_result[3] + self.status = statuses[opt_result[4]] + self.trace = None + + +class opt_rasm(Optimizer): + def __init__(self, *args, **kwargs): + Optimizer.__init__(self, *args, **kwargs) + self.opt_name = "Rasmussen's Conjugate Gradient" + + def opt(self, f_fp=None, f=None, fp=None): + """ + Run Rasmussen's Conjugate Gradient optimizer + """ + + assert f_fp != None, "Rasmussen's minimizer requires f_fp" + statuses = ['Converged', 'Line search failed', 'Maximum number of f evaluations reached', + 'NaNs in optimization'] + + opt_dict = {} + if self.xtol is not None: + print "WARNING: minimize doesn't have an xtol arg, so I'm going to ignore it" + if self.ftol is not None: + print "WARNING: minimize doesn't have an ftol arg, so I'm going to ignore it" + if self.gtol is not None: + print "WARNING: minimize doesn't have an gtol arg, so I'm going to ignore it" + + opt_result = rasm.minimize(self.x_init, f_fp, (), messages=self.messages, + maxnumfuneval=self.max_f_eval) + self.x_opt = opt_result[0] + self.f_opt = opt_result[1][-1] + self.funct_eval = opt_result[2] + self.status = statuses[opt_result[3]] + + self.trace = opt_result[1] + +class opt_SCG(Optimizer): + def __init__(self, *args, **kwargs): + if 'max_f_eval' in kwargs: + warn("max_f_eval deprecated for SCG optimizer: use max_iters instead!\nIgnoring max_f_eval!", FutureWarning) + Optimizer.__init__(self, *args, **kwargs) + + self.opt_name = "Scaled 
Conjugate Gradients" + + def opt(self, f_fp=None, f=None, fp=None): + assert not f is None + assert not fp is None + + opt_result = SCG(f, fp, self.x_init, display=self.messages, + maxiters=self.max_iters, + max_f_eval=self.max_f_eval, + xtol=self.xtol, ftol=self.ftol, + gtol=self.gtol) + + self.x_opt = opt_result[0] + self.trace = opt_result[1] + self.f_opt = self.trace[-1] + self.funct_eval = opt_result[2] + self.status = opt_result[3] + +def get_optimizer(f_min): + from sgd import opt_SGD + + optimizers = {'fmin_tnc': opt_tnc, + 'simplex': opt_simplex, + 'lbfgsb': opt_lbfgsb, + 'scg': opt_SCG, + 'sgd': opt_SGD} + + if rasm_available: + optimizers['rasmussen'] = opt_rasm + + for opt_name in optimizers.keys(): + if opt_name.lower().find(f_min.lower()) != -1: + return optimizers[opt_name] + + raise KeyError('No optimizer was found matching the name: %s' % f_min) diff --git a/GPy/inference/samplers.py b/GPy/inference/samplers.py new file mode 100644 index 00000000..c2b47bce --- /dev/null +++ b/GPy/inference/samplers.py @@ -0,0 +1,85 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +from scipy import linalg, optimize +import pylab as pb +import Tango +import sys +import re +import numdifftools as ndt +import pdb +import cPickle + + +class Metropolis_Hastings: + def __init__(self,model,cov=None): + """Metropolis Hastings, with tunings according to Gelman et al. 
""" + self.model = model + current = self.model._get_params_transformed() + self.D = current.size + self.chains = [] + if cov is None: + self.cov = model.Laplace_covariance() + else: + self.cov = cov + self.scale = 2.4/np.sqrt(self.D) + self.new_chain(current) + + def new_chain(self, start=None): + self.chains.append([]) + if start is None: + self.model.randomize() + else: + self.model._set_params_transformed(start) + + + + def sample(self, Ntotal, Nburn, Nthin, tune=True, tune_throughout=False, tune_interval=400): + current = self.model._get_params_transformed() + fcurrent = self.model.log_likelihood() + self.model.log_prior() + accepted = np.zeros(Ntotal,dtype=np.bool) + for it in range(Ntotal): + print "sample %d of %d\r"%(it,Ntotal), + sys.stdout.flush() + prop = np.random.multivariate_normal(current, self.cov*self.scale*self.scale) + self.model._set_params_transformed(prop) + fprop = self.model.log_likelihood() + self.model.log_prior() + + if fprop>fcurrent:#sample accepted, going 'uphill' + accepted[it] = True + current = prop + fcurrent = fprop + else: + u = np.random.rand() + if np.exp(fprop-fcurrent)>u:#sample accepted downhill + accepted[it] = True + current = prop + fcurrent = fprop + + #store current value + if (it > Nburn) & ((it%Nthin)==0): + self.chains[-1].append(current) + + #tuning! + if it & ((it%tune_interval)==0) & tune & ((it .25: + self.scale *= 1.1 + if pc < .15: + self.scale /= 1.1 + + def predict(self,function,args): + """Make a prediction for the function, to which we will pass the additional arguments""" + param = self.model._get_params() + fs = [] + for p in self.chain: + self.model._set_params(p) + fs.append(function(*args)) + self.model._set_params(param)# reset model to starting state + return fs + + + diff --git a/GPy/inference/scg.py b/GPy/inference/scg.py new file mode 100644 index 00000000..f4c7c9c4 --- /dev/null +++ b/GPy/inference/scg.py @@ -0,0 +1,191 @@ +# Copyright I. 
Nabney, N.Lawrence and James Hensman (1996 - 2012)
+
+# Scaled Conjugate Gradients, originally in Matlab as part of the Netlab toolbox by I. Nabney, converted to python by N. Lawrence and given a pythonic interface by James Hensman
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
+# HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE. 
+ + +import numpy as np +import sys + + +def print_out(len_maxiters, fnow, current_grad, beta, iteration): + print '\r', + print '{0:>0{mi}g} {1:> 12e} {2:> 12e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r', + sys.stdout.flush() + +def exponents(fnow, current_grad): + exps = [np.abs(fnow), current_grad] + return np.sign(exps) * np.log10(exps).astype(int) + +def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True, xtol=None, ftol=None, gtol=None): + """ + Optimisation through Scaled Conjugate Gradients (SCG) + + f: the objective function + gradf : the gradient function (should return a 1D np.ndarray) + x : the initial condition + + Returns + x the optimal value for x + flog : a list of all the objective values + function_eval number of fn evaluations + status: string describing convergence status + """ + if xtol is None: + xtol = 1e-6 + if ftol is None: + ftol = 1e-6 + if gtol is None: + gtol = 1e-5 + + sigma0 = 1.0e-8 + fold = f(x, *optargs) # Initial function value. + function_eval = 1 + fnow = fold + gradnew = gradf(x, *optargs) # Initial gradient. + if any(np.isnan(gradnew)): + raise UnexpectedInfOrNan + current_grad = np.dot(gradnew, gradnew) + gradold = gradnew.copy() + d = -gradnew # Initial search direction. + success = True # Force calculation of directional derivs. + nsuccess = 0 # nsuccess counts number of successes. + beta = 1.0 # Initial scale parameter. + betamin = 1.0e-60 # Lower bound on scale. + betamax = 1.0e50 # Upper bound on scale. + status = "Not converged" + + flog = [fold] + + iteration = 0 + + len_maxiters = len(str(maxiters)) + if display: + print ' {0:{mi}s} {1:11s} {2:11s} {3:11s}'.format("I", "F", "Scale", "|g|", mi=len_maxiters) + exps = exponents(fnow, current_grad) + p_iter = iteration + + # Main optimization loop. 
+ while iteration < maxiters: + + # Calculate first and second directional derivatives. + if success: + mu = np.dot(d, gradnew) + if mu >= 0: + d = -gradnew + mu = np.dot(d, gradnew) + kappa = np.dot(d, d) + sigma = sigma0 / np.sqrt(kappa) + xplus = x + sigma * d + gplus = gradf(xplus, *optargs) + theta = np.dot(d, (gplus - gradnew)) / sigma + + # Increase effective curvature and evaluate step size alpha. + delta = theta + beta * kappa + if delta <= 0: + delta = beta * kappa + beta = beta - theta / kappa + + alpha = -mu / delta + + # Calculate the comparison ratio. + xnew = x + alpha * d + fnew = f(xnew, *optargs) + function_eval += 1 + +# if function_eval >= max_f_eval: +# status = "maximum number of function evaluations exceeded" +# break +# return x, flog, function_eval, status + + Delta = 2.*(fnew - fold) / (alpha * mu) + if Delta >= 0.: + success = True + nsuccess += 1 + x = xnew + fnow = fnew + else: + success = False + fnow = fold + + # Store relevant variables + flog.append(fnow) # Current function value + + iteration += 1 + if display: + print_out(len_maxiters, fnow, current_grad, beta, iteration) + n_exps = exponents(fnow, current_grad) + if iteration - p_iter >= 20 * np.random.rand(): + a = iteration >= p_iter * 2.78 + b = np.any(n_exps < exps) + if a or b: + p_iter = iteration + print '' + if b: + exps = n_exps + + if success: + # Test for termination + + if (np.abs(fnew - fold) < ftol): + status = 'converged - relative reduction in objective' + break +# return x, flog, function_eval, status + elif (np.max(np.abs(alpha * d)) < xtol): + status = 'converged - relative stepsize' + break + else: + # Update variables for new position + gradnew = gradf(x, *optargs) + current_grad = np.dot(gradnew, gradnew) + gradold = gradnew + fold = fnew + # If the gradient is zero then we are done. 
+ if current_grad <= gtol: + status = 'converged - relative reduction in gradient' + break + # return x, flog, function_eval, status + + # Adjust beta according to comparison ratio. + if Delta < 0.25: + beta = min(4.0 * beta, betamax) + if Delta > 0.75: + beta = max(0.5 * beta, betamin) + + # Update search direction using Polak-Ribiere formula, or re-start + # in direction of negative gradient after nparams steps. + if nsuccess == x.size: + d = -gradnew +# beta = 1. # TODO: betareset!! + nsuccess = 0 + elif success: + Gamma = np.dot(gradold - gradnew, gradnew) / (mu) + d = Gamma * d - gradnew + else: + # If we get here, then we haven't terminated in the given number of + # iterations. + status = "maxiter exceeded" + + if display: + print_out(len_maxiters, fnow, current_grad, beta, iteration) + print "" + print status + return x, flog, function_eval, status diff --git a/GPy/inference/sgd.py b/GPy/inference/sgd.py new file mode 100644 index 00000000..e443f45a --- /dev/null +++ b/GPy/inference/sgd.py @@ -0,0 +1,356 @@ +import numpy as np +import scipy as sp +import scipy.sparse +from optimization import Optimizer +from scipy import linalg, optimize +import pylab as plt +import copy, sys, pickle + +class opt_SGD(Optimizer): + """ + Optimize using stochastic gradient descent. 
+ + *** Parameters *** + Model: reference to the Model object + iterations: number of iterations + learning_rate: learning rate + momentum: momentum + + """ + + def __init__(self, start, iterations = 10, learning_rate = 1e-4, momentum = 0.9, model = None, messages = False, batch_size = 1, self_paced = False, center = True, iteration_file = None, learning_rate_adaptation=None, actual_iter=None, schedule=None, **kwargs): + self.opt_name = "Stochastic Gradient Descent" + + self.Model = model + self.iterations = iterations + self.momentum = momentum + self.learning_rate = learning_rate + self.x_opt = None + self.f_opt = None + self.messages = messages + self.batch_size = batch_size + self.self_paced = self_paced + self.center = center + self.param_traces = [('noise',[])] + self.iteration_file = iteration_file + self.learning_rate_adaptation = learning_rate_adaptation + self.actual_iter = actual_iter + if self.learning_rate_adaptation != None: + if self.learning_rate_adaptation == 'annealing': + self.learning_rate_0 = self.learning_rate + else: + self.learning_rate_0 = self.learning_rate.mean() + + self.schedule = schedule + # if len([p for p in self.model.kern.parts if p.name == 'bias']) == 1: + # self.param_traces.append(('bias',[])) + # if len([p for p in self.model.kern.parts if p.name == 'linear']) == 1: + # self.param_traces.append(('linear',[])) + # if len([p for p in self.model.kern.parts if p.name == 'rbf']) == 1: + # self.param_traces.append(('rbf_var',[])) + + self.param_traces = dict(self.param_traces) + self.fopt_trace = [] + + num_params = len(self.Model._get_params()) + if isinstance(self.learning_rate, float): + self.learning_rate = np.ones((num_params,)) * self.learning_rate + + assert (len(self.learning_rate) == num_params), "there must be one learning rate per parameter" + + def __str__(self): + status = "\nOptimizer: \t\t\t %s\n" % self.opt_name + status += "f(x_opt): \t\t\t %.4f\n" % self.f_opt + status += "Number of iterations: \t\t %d\n" % 
self.iterations + status += "Learning rate: \t\t\t max %.3f, min %.3f\n" % (self.learning_rate.max(), self.learning_rate.min()) + status += "Momentum: \t\t\t %.3f\n" % self.momentum + status += "Batch size: \t\t\t %d\n" % self.batch_size + status += "Time elapsed: \t\t\t %s\n" % self.time + return status + + def plot_traces(self): + plt.figure() + plt.subplot(211) + plt.title('Parameters') + for k in self.param_traces.keys(): + plt.plot(self.param_traces[k], label=k) + plt.legend(loc=0) + plt.subplot(212) + plt.title('Objective function') + plt.plot(self.fopt_trace) + + + def non_null_samples(self, data): + return (np.isnan(data).sum(axis=1) == 0) + + def check_for_missing(self, data): + if sp.sparse.issparse(self.Model.likelihood.Y): + return True + else: + return np.isnan(data).sum() > 0 + + def subset_parameter_vector(self, x, samples, param_shapes): + subset = np.array([], dtype = int) + x = np.arange(0, len(x)) + i = 0 + + for s in param_shapes: + N, input_dim = s + X = x[i:i+N*input_dim].reshape(N, input_dim) + X = X[samples] + subset = np.append(subset, X.flatten()) + i += N*input_dim + + subset = np.append(subset, x[i:]) + + return subset + + def shift_constraints(self, j): + + constrained_indices = copy.deepcopy(self.Model.constrained_indices) + + for c, constraint in enumerate(constrained_indices): + mask = (np.ones_like(constrained_indices[c]) == 1) + for i in range(len(constrained_indices[c])): + pos = np.where(j == constrained_indices[c][i])[0] + if len(pos) == 1: + self.Model.constrained_indices[c][i] = pos + else: + mask[i] = False + + self.Model.constrained_indices[c] = self.Model.constrained_indices[c][mask] + return constrained_indices + # back them up + # bounded_i = copy.deepcopy(self.Model.constrained_bounded_indices) + # bounded_l = copy.deepcopy(self.Model.constrained_bounded_lowers) + # bounded_u = copy.deepcopy(self.Model.constrained_bounded_uppers) + + # for b in range(len(bounded_i)): # for each group of constraints + # for bc in 
range(len(bounded_i[b])): + # pos = np.where(j == bounded_i[b][bc])[0] + # if len(pos) == 1: + # pos2 = np.where(self.Model.constrained_bounded_indices[b] == bounded_i[b][bc])[0][0] + # self.Model.constrained_bounded_indices[b][pos2] = pos[0] + # else: + # if len(self.Model.constrained_bounded_indices[b]) == 1: + # # if it's the last index to be removed + # # the logic here is just a mess. If we remove the last one, then all the + # # b-indices change and we have to iterate through everything to find our + # # current index. Can't deal with this right now. + # raise NotImplementedError + + # else: # just remove it from the indices + # mask = self.Model.constrained_bounded_indices[b] != bc + # self.Model.constrained_bounded_indices[b] = self.Model.constrained_bounded_indices[b][mask] + + + # # here we shif the positive constraints. We cycle through each positive + # # constraint + # positive = self.Model.constrained_positive_indices.copy() + # mask = (np.ones_like(positive) == 1) + # for p in range(len(positive)): + # # we now check whether the constrained index appears in the j vector + # # (the vector of the "active" indices) + # pos = np.where(j == self.Model.constrained_positive_indices[p])[0] + # if len(pos) == 1: + # self.Model.constrained_positive_indices[p] = pos + # else: + # mask[p] = False + # self.Model.constrained_positive_indices = self.Model.constrained_positive_indices[mask] + + # return (bounded_i, bounded_l, bounded_u), positive + + def restore_constraints(self, c):#b, p): + # self.Model.constrained_bounded_indices = b[0] + # self.Model.constrained_bounded_lowers = b[1] + # self.Model.constrained_bounded_uppers = b[2] + # self.Model.constrained_positive_indices = p + self.Model.constrained_indices = c + + def get_param_shapes(self, N = None, input_dim = None): + model_name = self.Model.__class__.__name__ + if model_name == 'GPLVM': + return [(N, input_dim)] + if model_name == 'Bayesian_GPLVM': + return [(N, input_dim), (N, input_dim)] + else: + 
raise NotImplementedError + + def step_with_missing_data(self, f_fp, X, step, shapes): + N, input_dim = X.shape + + if not sp.sparse.issparse(self.Model.likelihood.Y): + Y = self.Model.likelihood.Y + samples = self.non_null_samples(self.Model.likelihood.Y) + self.Model.N = samples.sum() + Y = Y[samples] + else: + samples = self.Model.likelihood.Y.nonzero()[0] + self.Model.N = len(samples) + Y = np.asarray(self.Model.likelihood.Y[samples].todense(), dtype = np.float64) + + if self.Model.N == 0 or Y.std() == 0.0: + return 0, step, self.Model.N + + self.Model.likelihood._offset = Y.mean() + self.Model.likelihood._scale = Y.std() + self.Model.likelihood.set_data(Y) + # self.Model.likelihood.V = self.Model.likelihood.Y*self.Model.likelihood.precision + + sigma = self.Model.likelihood._variance + self.Model.likelihood._variance = None # invalidate cache + self.Model.likelihood._set_params(sigma) + + + j = self.subset_parameter_vector(self.x_opt, samples, shapes) + self.Model.X = X[samples] + + model_name = self.Model.__class__.__name__ + + if model_name == 'Bayesian_GPLVM': + self.Model.likelihood.YYT = np.dot(self.Model.likelihood.Y, self.Model.likelihood.Y.T) + self.Model.likelihood.trYYT = np.trace(self.Model.likelihood.YYT) + + ci = self.shift_constraints(j) + f, fp = f_fp(self.x_opt[j]) + + step[j] = self.momentum * step[j] + self.learning_rate[j] * fp + self.x_opt[j] -= step[j] + self.restore_constraints(ci) + + self.Model.grads[j] = fp + # restore likelihood _offset and _scale, otherwise when we call set_data(y) on + # the next feature, it will get normalized with the mean and std of this one. 
+ self.Model.likelihood._offset = 0 + self.Model.likelihood._scale = 1 + + return f, step, self.Model.N + + def adapt_learning_rate(self, t, D): + if self.learning_rate_adaptation == 'adagrad': + if t > 0: + g_k = self.Model.grads + self.s_k += np.square(g_k) + t0 = 100.0 + self.learning_rate = 0.1/(t0 + np.sqrt(self.s_k)) + + import pdb; pdb.set_trace() + else: + self.learning_rate = np.zeros_like(self.learning_rate) + self.s_k = np.zeros_like(self.x_opt) + + elif self.learning_rate_adaptation == 'annealing': + #self.learning_rate = self.learning_rate_0/(1+float(t+1)/10) + self.learning_rate = np.ones_like(self.learning_rate) * self.schedule[t] + + + elif self.learning_rate_adaptation == 'semi_pesky': + if self.Model.__class__.__name__ == 'Bayesian_GPLVM': + g_t = self.Model.grads + if t == 0: + self.hbar_t = 0.0 + self.tau_t = 100.0 + self.gbar_t = 0.0 + + self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t + self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t) + self.learning_rate = np.ones_like(self.learning_rate)*(np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t) + tau_t = self.tau_t*(1-self.learning_rate) + 1 + + + def opt(self, f_fp=None, f=None, fp=None): + self.x_opt = self.Model._get_params_transformed() + self.grads = [] + + X, Y = self.Model.X.copy(), self.Model.likelihood.Y.copy() + + self.Model.likelihood.YYT = 0 + self.Model.likelihood.trYYT = 0 + self.Model.likelihood._offset = 0.0 + self.Model.likelihood._scale = 1.0 + + N, input_dim = self.Model.X.shape + D = self.Model.likelihood.Y.shape[1] + num_params = self.Model._get_params() + self.trace = [] + missing_data = self.check_for_missing(self.Model.likelihood.Y) + + step = np.zeros_like(num_params) + for it in range(self.iterations): + if self.actual_iter != None: + it = self.actual_iter + + self.Model.grads = np.zeros_like(self.x_opt) # TODO this is ugly + + if it == 0 or self.self_paced is False: + features = np.random.permutation(Y.shape[1]) + else: + 
features = np.argsort(NLL) + + b = len(features)/self.batch_size + features = [features[i::b] for i in range(b)] + NLL = [] + import pylab as plt + for count, j in enumerate(features): + self.Model.input_dim = len(j) + self.Model.likelihood.input_dim = len(j) + self.Model.likelihood.set_data(Y[:, j]) + # self.Model.likelihood.V = self.Model.likelihood.Y*self.Model.likelihood.precision + + sigma = self.Model.likelihood._variance + self.Model.likelihood._variance = None # invalidate cache + self.Model.likelihood._set_params(sigma) + + if missing_data: + shapes = self.get_param_shapes(N, input_dim) + f, step, Nj = self.step_with_missing_data(f_fp, X, step, shapes) + else: + self.Model.likelihood.YYT = np.dot(self.Model.likelihood.Y, self.Model.likelihood.Y.T) + self.Model.likelihood.trYYT = np.trace(self.Model.likelihood.YYT) + Nj = N + f, fp = f_fp(self.x_opt) + self.Model.grads = fp.copy() + step = self.momentum * step + self.learning_rate * fp + self.x_opt -= step + + if self.messages == 2: + noise = self.Model.likelihood._variance + status = "evaluating {feature: 5d}/{tot: 5d} \t f: {f: 2.3f} \t non-missing: {nm: 4d}\t noise: {noise: 2.4f}\r".format(feature = count, tot = len(features), f = f, nm = Nj, noise = noise) + sys.stdout.write(status) + sys.stdout.flush() + self.param_traces['noise'].append(noise) + + self.adapt_learning_rate(it+count, D) + NLL.append(f) + self.fopt_trace.append(NLL[-1]) + # fig = plt.figure('traces') + # plt.clf() + # plt.plot(self.param_traces['noise']) + + # for k in self.param_traces.keys(): + # self.param_traces[k].append(self.Model.get(k)[0]) + self.grads.append(self.Model.grads.tolist()) + # should really be a sum(), but earlier samples in the iteration will have a very crappy ll + self.f_opt = np.mean(NLL) + self.Model.N = N + self.Model.X = X + self.Model.input_dim = D + self.Model.likelihood.N = N + self.Model.likelihood.input_dim = D + self.Model.likelihood.Y = Y + sigma = self.Model.likelihood._variance + 
self.Model.likelihood._variance = None # invalidate cache + self.Model.likelihood._set_params(sigma) + + self.trace.append(self.f_opt) + if self.iteration_file is not None: + f = open(self.iteration_file + "iteration%d.pickle" % it, 'w') + data = [self.x_opt, self.fopt_trace, self.param_traces] + pickle.dump(data, f) + f.close() + + if self.messages != 0: + sys.stdout.write('\r' + ' '*len(status)*2 + ' \r') + status = "SGD Iteration: {0: 3d}/{1: 3d} f: {2: 2.3f} max eta: {3: 1.5f}\n".format(it+1, self.iterations, self.f_opt, self.learning_rate.max()) + sys.stdout.write(status) + sys.stdout.flush() diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py new file mode 100644 index 00000000..eb4076c3 --- /dev/null +++ b/GPy/kern/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +from constructors import * +try: + from constructors import rbf_sympy, sympykern # these depend on sympy +except: + pass +from kern import * diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py new file mode 100644 index 00000000..f7c0fd67 --- /dev/null +++ b/GPy/kern/constructors.py @@ -0,0 +1,423 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
+# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +from kern import kern +import parts + + +def rbf_inv(input_dim,variance=1., inv_lengthscale=None,ARD=False): + """ + Construct an RBF kernel + + :param input_dim: dimensionality of the kernel, obligatory + :type input_dim: int + :param variance: the variance of the kernel + :type variance: float + :param lengthscale: the lengthscale of the kernel + :type lengthscale: float + :param ARD: Auto Relevance Determination (one lengthscale per dimension) + :type ARD: Boolean + """ + part = parts.rbf_inv.RBFInv(input_dim,variance,inv_lengthscale,ARD) + return kern(input_dim, [part]) + +def rbf(input_dim,variance=1., lengthscale=None,ARD=False): + """ + Construct an RBF kernel + + :param input_dim: dimensionality of the kernel, obligatory + :type input_dim: int + :param variance: the variance of the kernel + :type variance: float + :param lengthscale: the lengthscale of the kernel + :type lengthscale: float + :param ARD: Auto Relevance Determination (one lengthscale per dimension) + :type ARD: Boolean + """ + part = parts.rbf.RBF(input_dim,variance,lengthscale,ARD) + return kern(input_dim, [part]) + +def linear(input_dim,variances=None,ARD=False): + """ + Construct a linear kernel. + + Arguments + --------- + input_dimD (int), obligatory + variances (np.ndarray) + ARD (boolean) + """ + part = parts.linear.Linear(input_dim,variances,ARD) + return kern(input_dim, [part]) + +def mlp(input_dim,variance=1., weight_variance=None,bias_variance=100.,ARD=False): + """ + Construct an MLP kernel + + :param input_dim: dimensionality of the kernel, obligatory + :type input_dim: int + :param variance: the variance of the kernel + :type variance: float + :param weight_scale: the lengthscale of the kernel + :type weight_scale: vector of weight variances for input weights in neural network (length 1 if kernel is isotropic) + :param bias_variance: the variance of the biases in the neural network. 
+ :type bias_variance: float + :param ARD: Auto Relevance Determination (allows for ARD version of covariance) + :type ARD: Boolean + """ + part = parts.mlp.MLP(input_dim,variance,weight_variance,bias_variance,ARD) + return kern(input_dim, [part]) + +def gibbs(input_dim,variance=1., mapping=None): + """ + Gibbs and MacKay non-stationary covariance function. + + .. math:: + + r = sqrt((x_i - x_j)'*(x_i - x_j)) + + k(x_i, x_j) = \sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x'))) + + Z = \sqrt{2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x')} + + where :math:`l(x)` is a function giving the length scale as a function of space. + This is the non stationary kernel proposed by Mark Gibbs in his 1997 + thesis. It is similar to an RBF but has a length scale that varies + with input location. This leads to an additional term in front of + the kernel. + + The parameters are :math:`\sigma^2`, the process variance, and the parameters of l(x) which is a function that can be specified by the user, by default an multi-layer peceptron is used is used. + + :param input_dim: the number of input dimensions + :type input_dim: int + :param variance: the variance :math:`\sigma^2` + :type variance: float + :param mapping: the mapping that gives the lengthscale across the input space. + :type mapping: GPy.core.Mapping + :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension. 
+ :type ARD: Boolean + :rtype: Kernpart object + + """ + part = parts.gibbs.Gibbs(input_dim,variance,mapping) + return kern(input_dim, [part]) + +def poly(input_dim,variance=1., weight_variance=None,bias_variance=1.,degree=2, ARD=False): + """ + Construct a polynomial kernel + + :param input_dim: dimensionality of the kernel, obligatory + :type input_dim: int + :param variance: the variance of the kernel + :type variance: float + :param weight_scale: the lengthscale of the kernel + :type weight_scale: vector of weight variances for input weights. + :param bias_variance: the variance of the biases. + :type bias_variance: float + :param degree: the degree of the polynomial + :type degree: int + :param ARD: Auto Relevance Determination (allows for ARD version of covariance) + :type ARD: Boolean + """ + part = parts.poly.POLY(input_dim,variance,weight_variance,bias_variance,degree,ARD) + return kern(input_dim, [part]) + +def white(input_dim,variance=1.): + """ + Construct a white kernel. + + Arguments + --------- + input_dimD (int), obligatory + variance (float) + """ + part = parts.white.White(input_dim,variance) + return kern(input_dim, [part]) + +def exponential(input_dim,variance=1., lengthscale=None, ARD=False): + """ + Construct an exponential kernel + + :param input_dim: dimensionality of the kernel, obligatory + :type input_dim: int + :param variance: the variance of the kernel + :type variance: float + :param lengthscale: the lengthscale of the kernel + :type lengthscale: float + :param ARD: Auto Relevance Determination (one lengthscale per dimension) + :type ARD: Boolean + """ + part = parts.exponential.Exponential(input_dim,variance, lengthscale, ARD) + return kern(input_dim, [part]) + +def Matern32(input_dim,variance=1., lengthscale=None, ARD=False): + """ + Construct a Matern 3/2 kernel. 
+ + :param input_dim: dimensionality of the kernel, obligatory + :type input_dim: int + :param variance: the variance of the kernel + :type variance: float + :param lengthscale: the lengthscale of the kernel + :type lengthscale: float + :param ARD: Auto Relevance Determination (one lengthscale per dimension) + :type ARD: Boolean + """ + part = parts.Matern32.Matern32(input_dim,variance, lengthscale, ARD) + return kern(input_dim, [part]) + +def Matern52(input_dim, variance=1., lengthscale=None, ARD=False): + """ + Construct a Matern 5/2 kernel. + + :param input_dim: dimensionality of the kernel, obligatory + :type input_dim: int + :param variance: the variance of the kernel + :type variance: float + :param lengthscale: the lengthscale of the kernel + :type lengthscale: float + :param ARD: Auto Relevance Determination (one lengthscale per dimension) + :type ARD: Boolean + """ + part = parts.Matern52.Matern52(input_dim, variance, lengthscale, ARD) + return kern(input_dim, [part]) + +def bias(input_dim, variance=1.): + """ + Construct a bias kernel. + + Arguments + --------- + input_dim (int), obligatory + variance (float) + """ + part = parts.bias.Bias(input_dim, variance) + return kern(input_dim, [part]) + +def finite_dimensional(input_dim, F, G, variances=1., weights=None): + """ + Construct a finite dimensional kernel. + input_dim: int - the number of input dimensions + F: np.array of functions with shape (n,) - the n basis functions + G: np.array with shape (n,n) - the Gram matrix associated to F + variances : np.ndarray with shape (n,) + """ + part = parts.finite_dimensional.FiniteDimensional(input_dim, F, G, variances, weights) + return kern(input_dim, [part]) + +def spline(input_dim, variance=1.): + """ + Construct a spline kernel. 
+ + :param input_dim: Dimensionality of the kernel + :type input_dim: int + :param variance: the variance of the kernel + :type variance: float + """ + part = parts.spline.Spline(input_dim, variance) + return kern(input_dim, [part]) + +def Brownian(input_dim, variance=1.): + """ + Construct a Brownian motion kernel. + + :param input_dim: Dimensionality of the kernel + :type input_dim: int + :param variance: the variance of the kernel + :type variance: float + """ + part = parts.Brownian.Brownian(input_dim, variance) + return kern(input_dim, [part]) + +try: + import sympy as sp + from sympykern import spkern + from sympy.parsing.sympy_parser import parse_expr + sympy_available = True +except ImportError: + sympy_available = False + +if sympy_available: + def rbf_sympy(input_dim, ARD=False, variance=1., lengthscale=1.): + """ + Radial Basis Function covariance. + """ + X = [sp.var('x%i' % i) for i in range(input_dim)] + Z = [sp.var('z%i' % i) for i in range(input_dim)] + rbf_variance = sp.var('rbf_variance',positive=True) + if ARD: + rbf_lengthscales = [sp.var('rbf_lengthscale_%i' % i, positive=True) for i in range(input_dim)] + dist_string = ' + '.join(['(x%i-z%i)**2/rbf_lengthscale_%i**2' % (i, i, i) for i in range(input_dim)]) + dist = parse_expr(dist_string) + f = rbf_variance*sp.exp(-dist/2.) 
+ else: + rbf_lengthscale = sp.var('rbf_lengthscale',positive=True) + dist_string = ' + '.join(['(x%i-z%i)**2' % (i, i) for i in range(input_dim)]) + dist = parse_expr(dist_string) + f = rbf_variance*sp.exp(-dist/(2*rbf_lengthscale**2)) + return kern(input_dim, [spkern(input_dim, f)]) + + def sympykern(input_dim, k): + """ + A kernel from a symbolic sympy representation + """ + return kern(input_dim, [spkern(input_dim, k)]) +del sympy_available + +def periodic_exponential(input_dim=1, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi): + """ + Construct an periodic exponential kernel + + :param input_dim: dimensionality, only defined for input_dim=1 + :type input_dim: int + :param variance: the variance of the kernel + :type variance: float + :param lengthscale: the lengthscale of the kernel + :type lengthscale: float + :param period: the period + :type period: float + :param n_freq: the number of frequencies considered for the periodic subspace + :type n_freq: int + """ + part = parts.periodic_exponential.PeriodicExponential(input_dim, variance, lengthscale, period, n_freq, lower, upper) + return kern(input_dim, [part]) + +def periodic_Matern32(input_dim, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi): + """ + Construct a periodic Matern 3/2 kernel. 
+ + :param input_dim: dimensionality, only defined for input_dim=1 + :type input_dim: int + :param variance: the variance of the kernel + :type variance: float + :param lengthscale: the lengthscale of the kernel + :type lengthscale: float + :param period: the period + :type period: float + :param n_freq: the number of frequencies considered for the periodic subspace + :type n_freq: int + """ + part = parts.periodic_Matern32.PeriodicMatern32(input_dim, variance, lengthscale, period, n_freq, lower, upper) + return kern(input_dim, [part]) + +def periodic_Matern52(input_dim, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi): + """ + Construct a periodic Matern 5/2 kernel. + + :param input_dim: dimensionality, only defined for input_dim=1 + :type input_dim: int + :param variance: the variance of the kernel + :type variance: float + :param lengthscale: the lengthscale of the kernel + :type lengthscale: float + :param period: the period + :type period: float + :param n_freq: the number of frequencies considered for the periodic subspace + :type n_freq: int + """ + part = parts.periodic_Matern52.PeriodicMatern52(input_dim, variance, lengthscale, period, n_freq, lower, upper) + return kern(input_dim, [part]) + +def prod(k1,k2,tensor=False): + """ + Construct a product kernel over input_dim from two kernels over input_dim + + :param k1, k2: the kernels to multiply + :type k1, k2: kernpart + :param tensor: The kernels are either multiply as functions defined on the same input space (default) or on the product of the input spaces + :type tensor: Boolean + :rtype: kernel object + """ + part = parts.prod.Prod(k1, k2, tensor) + return kern(part.input_dim, [part]) + +def symmetric(k): + """ + Construct a symmetric kernel from an existing kernel + """ + k_ = k.copy() + k_.parts = [symmetric.Symmetric(p) for p in k.parts] + return k_ + +def coregionalise(output_dim, rank=1, W=None, kappa=None): + """ + Coregionalisation kernel. 
+ + Used for computing covariance functions of the form + .. math:: + k_2(x, y)=\mathbf{B} k(x, y) + where + .. math:: + \mathbf{B} = \mathbf{W}\mathbf{W}^\top + kappa \mathbf{I} + + :param output_dim: the number of output dimensions + :type output_dim: int + :param rank: the rank of the coregionalisation matrix. + :type rank: int + :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B. + :type W: ndarray + :param kappa: a diagonal term which allows the outputs to behave independently. + :rtype: kernel object + + .. Note: see coregionalisation examples in GPy.examples.regression for some usage. + """ + p = parts.coregionalise.Coregionalise(output_dim,rank,W,kappa) + return kern(1,[p]) + + +def rational_quadratic(input_dim, variance=1., lengthscale=1., power=1.): + """ + Construct rational quadratic kernel. + + :param input_dim: the number of input dimensions + :type input_dim: int (input_dim=1 is the only value currently supported) + :param variance: the variance :math:`\sigma^2` + :type variance: float + :param lengthscale: the lengthscale :math:`\ell` + :type lengthscale: float + :rtype: kern object + + """ + part = parts.rational_quadratic.RationalQuadratic(input_dim, variance, lengthscale, power) + return kern(input_dim, [part]) + +def fixed(input_dim, K, variance=1.): + """ + Construct a Fixed effect kernel. 
+ + :param input_dim: the number of input dimensions + :type input_dim: int (input_dim=1 is the only value currently supported) + :param K: the variance :math:`\sigma^2` + :type K: np.array + :param variance: kernel variance + :type variance: float + :rtype: kern object + """ + part = parts.fixed.Fixed(input_dim, K, variance) + return kern(input_dim, [part]) + +def rbfcos(input_dim, variance=1., frequencies=None, bandwidths=None, ARD=False): + """ + construct a rbfcos kernel + """ + part = parts.rbfcos.RBFCos(input_dim, variance, frequencies, bandwidths, ARD) + return kern(input_dim, [part]) + +def independent_outputs(k): + """ + Construct a kernel with independent outputs from an existing kernel + """ + for sl in k.input_slices: + assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)" + _parts = [parts.independent_outputs.IndependentOutputs(p) for p in k.parts] + return kern(k.input_dim+1,_parts) + +def hierarchical(k): + """ + TODO THis can't be right! Construct a kernel with independent outputs from an existing kernel + """ + # for sl in k.input_slices: + # assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)" + _parts = [parts.hierarchical.Hierarchical(k.parts)] + return kern(k.input_dim+len(k.parts),_parts) diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py new file mode 100644 index 00000000..d9928295 --- /dev/null +++ b/GPy/kern/kern.py @@ -0,0 +1,669 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +import pylab as pb +from ..core.parameterized import Parameterized +from parts.kernpart import Kernpart +import itertools +from parts.prod import Prod as prod +from matplotlib.transforms import offset_copy + +class kern(Parameterized): + def __init__(self, input_dim, parts=[], input_slices=None): + """ + This is the main kernel class for GPy. 
It handles multiple (additive) kernel functions, and keeps track of variaous things like which parameters live where. + + The technical code for kernels is divided into _parts_ (see + e.g. rbf.py). This object contains a list of parts, which are + computed additively. For multiplication, special _prod_ parts + are used. + + :param input_dim: The dimensionality of the kernel's input space + :type input_dim: int + :param parts: the 'parts' (PD functions) of the kernel + :type parts: list of Kernpart objects + :param input_slices: the slices on the inputs which apply to each kernel + :type input_slices: list of slice objects, or list of bools + + """ + self.parts = parts + self.Nparts = len(parts) + self.num_params = sum([p.num_params for p in self.parts]) + + self.input_dim = input_dim + + # deal with input_slices + if input_slices is None: + self.input_slices = [slice(None) for p in self.parts] + else: + assert len(input_slices) == len(self.parts) + self.input_slices = [sl if type(sl) is slice else slice(None) for sl in input_slices] + + for p in self.parts: + assert isinstance(p, Kernpart), "bad kernel part" + + self.compute_param_slices() + + Parameterized.__init__(self) + + def getstate(self): + """ + Get the current state of the class, + here just all the indices, rest can get recomputed + """ + return Parameterized.getstate(self) + [self.parts, + self.Nparts, + self.num_params, + self.input_dim, + self.input_slices, + self.param_slices + ] + + def setstate(self, state): + self.param_slices = state.pop() + self.input_slices = state.pop() + self.input_dim = state.pop() + self.num_params = state.pop() + self.Nparts = state.pop() + self.parts = state.pop() + Parameterized.setstate(self, state) + + + def plot_ARD(self, fignum=None, ax=None, title='', legend=False): + """If an ARD kernel is present, it bar-plots the ARD parameters, + :param fignum: figure number of the plot + :param ax: matplotlib axis to plot on + :param title: + title of the plot, + pass '' to not 
print a title + pass None for a generic title + """ + if ax is None: + fig = pb.figure(fignum) + ax = fig.add_subplot(111) + else: + fig = ax.figure + from GPy.util import Tango + from matplotlib.textpath import TextPath + Tango.reset() + xticklabels = [] + bars = [] + x0 = 0 + for p in self.parts: + c = Tango.nextMedium() + if hasattr(p, 'ARD') and p.ARD: + if title is None: + ax.set_title('ARD parameters, %s kernel' % p.name) + else: + ax.set_title(title) + if p.name == 'linear': + ard_params = p.variances + else: + ard_params = 1. / p.lengthscale + + x = np.arange(x0, x0 + len(ard_params)) + bars.append(ax.bar(x, ard_params, align='center', color=c, edgecolor='k', linewidth=1.2, label=p.name)) + xticklabels.extend([r"$\mathrm{{{name}}}\ {x}$".format(name=p.name, x=i) for i in np.arange(len(ard_params))]) + x0 += len(ard_params) + x = np.arange(x0) + transOffset = offset_copy(ax.transData, fig=fig, + x=0., y= -2., units='points') + transOffsetUp = offset_copy(ax.transData, fig=fig, + x=0., y=1., units='points') + for bar in bars: + for patch, num in zip(bar.patches, np.arange(len(bar.patches))): + height = patch.get_height() + xi = patch.get_x() + patch.get_width() / 2. + va = 'top' + c = 'w' + t = TextPath((0, 0), "${xi}$".format(xi=xi), rotation=0, usetex=True, ha='center') + transform = transOffset + if patch.get_extents().height <= t.get_extents().height + 3: + va = 'bottom' + c = 'k' + transform = transOffsetUp + ax.text(xi, height, "${xi}$".format(xi=int(num)), color=c, rotation=0, ha='center', va=va, transform=transform) + # for xi, t in zip(x, xticklabels): + # ax.text(xi, maxi / 2, t, rotation=90, ha='center', va='center') + # ax.set_xticklabels(xticklabels, rotation=17) + ax.set_xticks([]) + ax.set_xlim(-.5, x0 - .5) + if legend: + if title is '': + mode = 'expand' + if len(bars) > 1: + mode = 'expand' + ax.legend(bbox_to_anchor=(0., 1.02, 1., 1.02), loc=3, + ncol=len(bars), mode=mode, borderaxespad=0.) 
+ fig.tight_layout(rect=(0, 0, 1, .9)) + else: + ax.legend() + return ax + + def _transform_gradients(self, g): + x = self._get_params() + [np.put(x, i, x * t.gradfactor(x[i])) for i, t in zip(self.constrained_indices, self.constraints)] + [np.put(g, i, v) for i, v in [(t[0], np.sum(g[t])) for t in self.tied_indices]] + if len(self.tied_indices) or len(self.fixed_indices): + to_remove = np.hstack((self.fixed_indices + [t[1:] for t in self.tied_indices])) + return np.delete(g, to_remove) + else: + return g + + def compute_param_slices(self): + """create a set of slices that can index the parameters of each part.""" + self.param_slices = [] + count = 0 + for p in self.parts: + self.param_slices.append(slice(count, count + p.num_params)) + count += p.num_params + + def __add__(self, other): + """ + Shortcut for `add`. + """ + return self.add(other) + + def add(self, other, tensor=False): + """ + Add another kernel to this one. Both kernels are defined on the same _space_ + :param other: the other kernel to be added + :type other: GPy.kern + """ + if tensor: + D = self.input_dim + other.input_dim + self_input_slices = [slice(*sl.indices(self.input_dim)) for sl in self.input_slices] + other_input_indices = [sl.indices(other.input_dim) for sl in other.input_slices] + other_input_slices = [slice(i[0] + self.input_dim, i[1] + self.input_dim, i[2]) for i in other_input_indices] + + newkern = kern(D, self.parts + other.parts, self_input_slices + other_input_slices) + + # transfer constraints: + newkern.constrained_indices = self.constrained_indices + [x + self.num_params for x in other.constrained_indices] + newkern.constraints = self.constraints + other.constraints + newkern.fixed_indices = self.fixed_indices + [self.num_params + x for x in other.fixed_indices] + newkern.fixed_values = self.fixed_values + other.fixed_values + newkern.constraints = self.constraints + other.constraints + newkern.tied_indices = self.tied_indices + [self.num_params + x for x in 
other.tied_indices] + else: + assert self.input_dim == other.input_dim + newkern = kern(self.input_dim, self.parts + other.parts, self.input_slices + other.input_slices) + # transfer constraints: + newkern.constrained_indices = self.constrained_indices + [i + self.num_params for i in other.constrained_indices] + newkern.constraints = self.constraints + other.constraints + newkern.fixed_indices = self.fixed_indices + [self.num_params + x for x in other.fixed_indices] + newkern.fixed_values = self.fixed_values + other.fixed_values + newkern.tied_indices = self.tied_indices + [self.num_params + x for x in other.tied_indices] + return newkern + + def __mul__(self, other): + """ + Shortcut for `prod`. + """ + return self.prod(other) + + def __pow__(self, other, tensor=False): + """ + Shortcut for tensor `prod`. + """ + return self.prod(other, tensor=True) + + def prod(self, other, tensor=False): + """ + multiply two kernels (either on the same space, or on the tensor product of the input space). + :param other: the other kernel to be added + :type other: GPy.kern + :param tensor: whether or not to use the tensor space (default is false). 
+ :type tensor: bool + """ + K1 = self.copy() + K2 = other.copy() + + slices = [] + for sl1, sl2 in itertools.product(K1.input_slices, K2.input_slices): + s1, s2 = [False] * K1.input_dim, [False] * K2.input_dim + s1[sl1], s2[sl2] = [True], [True] + slices += [s1 + s2] + + newkernparts = [prod(k1, k2, tensor) for k1, k2 in itertools.product(K1.parts, K2.parts)] + + if tensor: + newkern = kern(K1.input_dim + K2.input_dim, newkernparts, slices) + else: + newkern = kern(K1.input_dim, newkernparts, slices) + + newkern._follow_constrains(K1, K2) + return newkern + + def _follow_constrains(self, K1, K2): + + # Build the array that allows to go from the initial indices of the param to the new ones + K1_param = [] + n = 0 + for k1 in K1.parts: + K1_param += [range(n, n + k1.num_params)] + n += k1.num_params + n = 0 + K2_param = [] + for k2 in K2.parts: + K2_param += [range(K1.num_params + n, K1.num_params + n + k2.num_params)] + n += k2.num_params + index_param = [] + for p1 in K1_param: + for p2 in K2_param: + index_param += p1 + p2 + index_param = np.array(index_param) + + # Get the ties and constrains of the kernels before the multiplication + prev_ties = K1.tied_indices + [arr + K1.num_params for arr in K2.tied_indices] + + prev_constr_ind = [K1.constrained_indices] + [K1.num_params + i for i in K2.constrained_indices] + prev_constr = K1.constraints + K2.constraints + + # prev_constr_fix = K1.fixed_indices + [arr + K1.num_params for arr in K2.fixed_indices] + # prev_constr_fix_values = K1.fixed_values + K2.fixed_values + + # follow the previous ties + for arr in prev_ties: + for j in arr: + index_param[np.where(index_param == j)[0]] = arr[0] + + # ties and constrains + for i in range(K1.num_params + K2.num_params): + index = np.where(index_param == i)[0] + if index.size > 1: + self.tie_params(index) + for i, t in zip(prev_constr_ind, prev_constr): + self.constrain(np.where(index_param == i)[0], t) + + def _get_params(self): + return np.hstack([p._get_params() for p in 
self.parts]) + + def _set_params(self, x): + [p._set_params(x[s]) for p, s in zip(self.parts, self.param_slices)] + + def _get_param_names(self): + # this is a bit nasty: we want to distinguish between parts with the same name by appending a count + part_names = np.array([k.name for k in self.parts], dtype=np.str) + counts = [np.sum(part_names == ni) for i, ni in enumerate(part_names)] + cum_counts = [np.sum(part_names[i:] == ni) for i, ni in enumerate(part_names)] + names = [name + '_' + str(cum_count) if count > 1 else name for name, count, cum_count in zip(part_names, counts, cum_counts)] + + return sum([[name + '_' + n for n in k._get_param_names()] for name, k in zip(names, self.parts)], []) + + def K(self, X, X2=None, which_parts='all'): + if which_parts == 'all': + which_parts = [True] * self.Nparts + assert X.shape[1] == self.input_dim + if X2 is None: + target = np.zeros((X.shape[0], X.shape[0])) + [p.K(X[:, i_s], None, target=target) for p, i_s, part_i_used in zip(self.parts, self.input_slices, which_parts) if part_i_used] + else: + target = np.zeros((X.shape[0], X2.shape[0])) + [p.K(X[:, i_s], X2[:, i_s], target=target) for p, i_s, part_i_used in zip(self.parts, self.input_slices, which_parts) if part_i_used] + return target + + def dK_dtheta(self, dL_dK, X, X2=None): + """ + Compute the gradient of the covariance function with respect to the parameters. + + :param dL_dK: An array of gradients of the objective function with respect to the covariance function. 
+ :type dL_dK: Np.ndarray (num_samples x num_inducing) + :param X: Observed data inputs + :type X: np.ndarray (num_samples x input_dim) + :param X2: Observed data inputs (optional, defaults to X) + :type X2: np.ndarray (num_inducing x input_dim) + """ + assert X.shape[1] == self.input_dim + target = np.zeros(self.num_params) + if X2 is None: + [p.dK_dtheta(dL_dK, X[:, i_s], None, target[ps]) for p, i_s, ps, in zip(self.parts, self.input_slices, self.param_slices)] + else: + [p.dK_dtheta(dL_dK, X[:, i_s], X2[:, i_s], target[ps]) for p, i_s, ps, in zip(self.parts, self.input_slices, self.param_slices)] + + return self._transform_gradients(target) + + def dK_dX(self, dL_dK, X, X2=None): + """Compute the gradient of the covariance function with respect to X. + + :param dL_dK: An array of gradients of the objective function with respect to the covariance function. + :type dL_dK: np.ndarray (num_samples x num_inducing) + :param X: Observed data inputs + :type X: np.ndarray (num_samples x input_dim) + :param X2: Observed data inputs (optional, defaults to X) + :type X2: np.ndarray (num_inducing x input_dim)""" + if X2 is None: + X2 = X + target = np.zeros_like(X) + if X2 is None: + [p.dK_dX(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)] + else: + [p.dK_dX(dL_dK, X[:, i_s], X2[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)] + return target + + def Kdiag(self, X, which_parts='all'): + """Compute the diagonal of the covariance function for inputs X.""" + if which_parts == 'all': + which_parts = [True] * self.Nparts + assert X.shape[1] == self.input_dim + target = np.zeros(X.shape[0]) + [p.Kdiag(X[:, i_s], target=target) for p, i_s, part_on in zip(self.parts, self.input_slices, which_parts) if part_on] + return target + + def dKdiag_dtheta(self, dL_dKdiag, X): + """Compute the gradient of the diagonal of the covariance function with respect to the parameters.""" + assert X.shape[1] == self.input_dim + 
assert dL_dKdiag.size == X.shape[0] + target = np.zeros(self.num_params) + [p.dKdiag_dtheta(dL_dKdiag, X[:, i_s], target[ps]) for p, i_s, ps in zip(self.parts, self.input_slices, self.param_slices)] + return self._transform_gradients(target) + + def dKdiag_dX(self, dL_dKdiag, X): + assert X.shape[1] == self.input_dim + target = np.zeros_like(X) + [p.dKdiag_dX(dL_dKdiag, X[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)] + return target + + def psi0(self, Z, mu, S): + target = np.zeros(mu.shape[0]) + [p.psi0(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)] + return target + + def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S): + target = np.zeros(self.num_params) + [p.dpsi0_dtheta(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self.parts, self.param_slices, self.input_slices)] + return self._transform_gradients(target) + + def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S): + target_mu, target_S = np.zeros_like(mu), np.zeros_like(S) + [p.dpsi0_dmuS(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)] + return target_mu, target_S + + def psi1(self, Z, mu, S): + target = np.zeros((mu.shape[0], Z.shape[0])) + [p.psi1(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)] + return target + + def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S): + target = np.zeros((self.num_params)) + [p.dpsi1_dtheta(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self.parts, self.param_slices, self.input_slices)] + return self._transform_gradients(target) + + def dpsi1_dZ(self, dL_dpsi1, Z, mu, S): + target = np.zeros_like(Z) + [p.dpsi1_dZ(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)] + return target + + def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S): + """return shapes are num_samples,num_inducing,input_dim""" + 
target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1])) + [p.dpsi1_dmuS(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)] + return target_mu, target_S + + def psi2(self, Z, mu, S): + """ + Computer the psi2 statistics for the covariance function. + + :param Z: np.ndarray of inducing inputs (num_inducing x input_dim) + :param mu, S: np.ndarrays of means and variances (each num_samples x input_dim) + :returns psi2: np.ndarray (num_samples,num_inducing,num_inducing) + """ + target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0])) + [p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)] + + # compute the "cross" terms + # TODO: input_slices needed + crossterms = 0 + + for [p1, i_s1], [p2, i_s2] in itertools.combinations(zip(self.parts, self.input_slices), 2): + if i_s1 == i_s2: + # TODO psi1 this must be faster/better/precached/more nice + tmp1 = np.zeros((mu.shape[0], Z.shape[0])) + p1.psi1(Z[:, i_s1], mu[:, i_s1], S[:, i_s1], tmp1) + tmp2 = np.zeros((mu.shape[0], Z.shape[0])) + p2.psi1(Z[:, i_s2], mu[:, i_s2], S[:, i_s2], tmp2) + + prod = np.multiply(tmp1, tmp2) + crossterms += prod[:, :, None] + prod[:, None, :] + + # target += crossterms + return target + crossterms + + def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S): + """Gradient of the psi2 statistics with respect to the parameters.""" + target = np.zeros(self.num_params) + [p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self.parts, self.input_slices, self.param_slices)] + + # compute the "cross" terms + # TODO: better looping, input_slices + for i1, i2 in itertools.permutations(range(len(self.parts)), 2): + p1, p2 = self.parts[i1], self.parts[i2] +# ipsl1, ipsl2 = self.input_slices[i1], self.input_slices[i2] + ps1, ps2 = self.param_slices[i1], self.param_slices[i2] + + tmp = np.zeros((mu.shape[0], Z.shape[0])) + p1.psi1(Z, mu, S, tmp) + 
p2.dpsi1_dtheta((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target[ps2]) + + return self._transform_gradients(target) + + def dpsi2_dZ(self, dL_dpsi2, Z, mu, S): + target = np.zeros_like(Z) + [p.dpsi2_dZ(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)] + # target *= 2 + + # compute the "cross" terms + # TODO: we need input_slices here. + for p1, p2 in itertools.permutations(self.parts, 2): + if p1.name == 'linear' and p2.name == 'linear': + raise NotImplementedError("We don't handle linear/linear cross-terms") + tmp = np.zeros((mu.shape[0], Z.shape[0])) + p1.psi1(Z, mu, S, tmp) + p2.dpsi1_dZ((tmp[:, None, :] * dL_dpsi2).sum(1), Z, mu, S, target) + + return target * 2 + + def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S): + target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1])) + [p.dpsi2_dmuS(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)] + + # compute the "cross" terms + # TODO: we need input_slices here. 
+ for p1, p2 in itertools.permutations(self.parts, 2): + if p1.name == 'linear' and p2.name == 'linear': + raise NotImplementedError("We don't handle linear/linear cross-terms") + + tmp = np.zeros((mu.shape[0], Z.shape[0])) + p1.psi1(Z, mu, S, tmp) + p2.dpsi1_dmuS((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target_mu, target_S) + + return target_mu, target_S + + def plot(self, x=None, plot_limits=None, which_parts='all', resolution=None, *args, **kwargs): + if which_parts == 'all': + which_parts = [True] * self.Nparts + if self.input_dim == 1: + if x is None: + x = np.zeros((1, 1)) + else: + x = np.asarray(x) + assert x.size == 1, "The size of the fixed variable x is not 1" + x = x.reshape((1, 1)) + + if plot_limits == None: + xmin, xmax = (x - 5).flatten(), (x + 5).flatten() + elif len(plot_limits) == 2: + xmin, xmax = plot_limits + else: + raise ValueError, "Bad limits for plotting" + + Xnew = np.linspace(xmin, xmax, resolution or 201)[:, None] + Kx = self.K(Xnew, x, which_parts) + pb.plot(Xnew, Kx, *args, **kwargs) + pb.xlim(xmin, xmax) + pb.xlabel("x") + pb.ylabel("k(x,%0.1f)" % x) + + elif self.input_dim == 2: + if x is None: + x = np.zeros((1, 2)) + else: + x = np.asarray(x) + assert x.size == 2, "The size of the fixed variable x is not 2" + x = x.reshape((1, 2)) + + if plot_limits == None: + xmin, xmax = (x - 5).flatten(), (x + 5).flatten() + elif len(plot_limits) == 2: + xmin, xmax = plot_limits + else: + raise ValueError, "Bad limits for plotting" + + resolution = resolution or 51 + xx, yy = np.mgrid[xmin[0]:xmax[0]:1j * resolution, xmin[1]:xmax[1]:1j * resolution] + xg = np.linspace(xmin[0], xmax[0], resolution) + yg = np.linspace(xmin[1], xmax[1], resolution) + Xnew = np.vstack((xx.flatten(), yy.flatten())).T + Kx = self.K(Xnew, x, which_parts) + Kx = Kx.reshape(resolution, resolution).T + pb.contour(xg, yg, Kx, vmin=Kx.min(), vmax=Kx.max(), cmap=pb.cm.jet, *args, **kwargs) # @UndefinedVariable + pb.xlim(xmin[0], xmax[0]) + pb.ylim(xmin[1], 
xmax[1]) + pb.xlabel("x1") + pb.ylabel("x2") + pb.title("k(x1,x2 ; %0.1f,%0.1f)" % (x[0, 0], x[0, 1])) + else: + raise NotImplementedError, "Cannot plot a kernel with more than two input dimensions" + +from GPy.core.model import Model + +class Kern_check_model(Model): + """This is a dummy model class used as a base class for checking that the gradients of a given kernel are implemented correctly. It enables checkgradient() to be called independently on a kernel.""" + def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): + num_samples = 20 + num_samples2 = 10 + if kernel==None: + kernel = GPy.kern.rbf(1) + if X==None: + X = np.random.randn(num_samples, kernel.input_dim) + if dL_dK==None: + if X2==None: + dL_dK = np.ones((X.shape[0], X.shape[0])) + else: + dL_dK = np.ones((X.shape[0], X2.shape[0])) + + self.kernel=kernel + self.X = X + self.X2 = X2 + self.dL_dK = dL_dK + #self.constrained_indices=[] + #self.constraints=[] + Model.__init__(self) + + def is_positive_definite(self): + v = np.linalg.eig(self.kernel.K(self.X))[0] + if any(v<0): + return False + else: + return True + + def _get_params(self): + return self.kernel._get_params() + + def _get_param_names(self): + return self.kernel._get_param_names() + + def _set_params(self, x): + self.kernel._set_params(x) + + def log_likelihood(self): + return (self.dL_dK*self.kernel.K(self.X, self.X2)).sum() + + def _log_likelihood_gradients(self): + raise NotImplementedError, "This needs to be implemented to use the kern_check_model class." + +class Kern_check_dK_dtheta(Kern_check_model): + """This class allows gradient checks for the gradient of a kernel with respect to parameters. 
""" + def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): + Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2) + + def _log_likelihood_gradients(self): + return self.kernel.dK_dtheta(self.dL_dK, self.X, self.X2) + +class Kern_check_dKdiag_dtheta(Kern_check_model): + """This class allows gradient checks of the gradient of the diagonal of a kernel with respect to the parameters.""" + def __init__(self, kernel=None, dL_dK=None, X=None): + Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None) + if dL_dK==None: + self.dL_dK = np.ones((self.X.shape[0])) + + def log_likelihood(self): + return (self.dL_dK*self.kernel.Kdiag(self.X)).sum() + + def _log_likelihood_gradients(self): + return self.kernel.dKdiag_dtheta(self.dL_dK, self.X) + +class Kern_check_dK_dX(Kern_check_model): + """This class allows gradient checks for the gradient of a kernel with respect to X. """ + def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): + Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2) + + def _log_likelihood_gradients(self): + return self.kernel.dK_dX(self.dL_dK, self.X, self.X2).flatten() + + def _get_param_names(self): + return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])] + + def _get_params(self): + return self.X.flatten() + + def _set_params(self, x): + self.X=x.reshape(self.X.shape) + +class Kern_check_dKdiag_dX(Kern_check_model): + """This class allows gradient checks for the gradient of a kernel diagonal with respect to X. 
""" + def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): + Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None) + if dL_dK==None: + self.dL_dK = np.ones((self.X.shape[0])) + + def log_likelihood(self): + return (self.dL_dK*self.kernel.Kdiag(self.X)).sum() + + def _log_likelihood_gradients(self): + return self.kernel.dKdiag_dX(self.dL_dK, self.X).flatten() + + def _get_param_names(self): + return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])] + + def _get_params(self): + return self.X.flatten() + + def _set_params(self, x): + self.X=x.reshape(self.X.shape) + +def kern_test(kern, X=None, X2=None, verbose=False): + """This function runs on kernels to check the correctness of their implementation. It checks that the covariance function is positive definite for a randomly generated data set. + + :param kern: the kernel to be tested. + :type kern: GPy.kern.Kernpart + :param X: X input values to test the covariance function. + :type X: ndarray + :param X2: X2 input values to test the covariance function. + :type X2: ndarray + """ + if X==None: + X = np.random.randn(10, kern.input_dim) + if X2==None: + X2 = np.random.randn(20, kern.input_dim) + result = [Kern_check_model(kern, X=X).is_positive_definite(), + Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose), + Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=verbose), + Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=verbose), + Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose), + Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose)] + # Need to check + #Kern_check_dK_dX(kern, X, X2=None).checkgrad(verbose=verbose)] + # but currently I think these aren't implemented. 
+ return np.all(result) diff --git a/GPy/kern/parts/Brownian.py b/GPy/kern/parts/Brownian.py new file mode 100644 index 00000000..bdfa0df5 --- /dev/null +++ b/GPy/kern/parts/Brownian.py @@ -0,0 +1,65 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +from kernpart import Kernpart +import numpy as np + +def theta(x): + """Heavisdie step function""" + return np.where(x>=0.,1.,0.) + +class Brownian(Kernpart): + """ + Brownian Motion kernel. + + :param input_dim: the number of input dimensions + :type input_dim: int + :param variance: + :type variance: float + """ + def __init__(self,input_dim,variance=1.): + self.input_dim = input_dim + assert self.input_dim==1, "Brownian motion in 1D only" + self.num_params = 1 + self.name = 'Brownian' + self._set_params(np.array([variance]).flatten()) + + def _get_params(self): + return self.variance + + def _set_params(self,x): + assert x.shape==(1,) + self.variance = x + + def _get_param_names(self): + return ['variance'] + + def K(self,X,X2,target): + if X2 is None: + X2 = X + target += self.variance*np.fmin(X,X2.T) + + def Kdiag(self,X,target): + target += self.variance*X.flatten() + + def dK_dtheta(self,dL_dK,X,X2,target): + if X2 is None: + X2 = X + target += np.sum(np.fmin(X,X2.T)*dL_dK) + + def dKdiag_dtheta(self,dL_dKdiag,X,target): + target += np.dot(X.flatten(), dL_dKdiag) + + def dK_dX(self,dL_dK,X,X2,target): + raise NotImplementedError, "TODO" + #target += self.variance + #target -= self.variance*theta(X-X2.T) + #if X.shape==X2.shape: + #if np.all(X==X2): + #np.add(target[:,:,0],self.variance*np.diag(X2.flatten()-X.flatten()),target[:,:,0]) + + + def dKdiag_dX(self,dL_dKdiag,X,target): + target += self.variance*dL_dKdiag[:,None] + diff --git a/GPy/kern/parts/Matern32.py b/GPy/kern/parts/Matern32.py new file mode 100644 index 00000000..60f0b6e9 --- /dev/null +++ b/GPy/kern/parts/Matern32.py @@ -0,0 +1,135 @@ +# Copyright (c) 2012, GPy authors (see 
AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +from kernpart import Kernpart +import numpy as np +from scipy import integrate + +class Matern32(Kernpart): + """ + Matern 3/2 kernel: + + .. math:: + + k(r) = \\sigma^2 (1 + \\sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} } + + :param input_dim: the number of input dimensions + :type input_dim: int + :param variance: the variance :math:`\sigma^2` + :type variance: float + :param lengthscale: the vector of lengthscale :math:`\ell_i` + :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter) + :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension. + :type ARD: Boolean + :rtype: kernel object + + """ + + def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False): + self.input_dim = input_dim + self.ARD = ARD + if ARD == False: + self.num_params = 2 + self.name = 'Mat32' + if lengthscale is not None: + lengthscale = np.asarray(lengthscale) + assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel" + else: + lengthscale = np.ones(1) + else: + self.num_params = self.input_dim + 1 + self.name = 'Mat32' + if lengthscale is not None: + lengthscale = np.asarray(lengthscale) + assert lengthscale.size == self.input_dim, "bad number of lengthscales" + else: + lengthscale = np.ones(self.input_dim) + self._set_params(np.hstack((variance, lengthscale.flatten()))) + + def _get_params(self): + """return the value of the parameters.""" + return np.hstack((self.variance, self.lengthscale)) + + def _set_params(self, x): + """set the value of the parameters.""" + assert x.size == self.num_params + self.variance = x[0] + self.lengthscale = x[1:] + + def _get_param_names(self): + """return parameter names.""" + if 
self.num_params == 2: + return ['variance', 'lengthscale'] + else: + return ['variance'] + ['lengthscale_%i' % i for i in range(self.lengthscale.size)] + + def K(self, X, X2, target): + """Compute the covariance matrix between X and X2.""" + if X2 is None: X2 = X + dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1)) + np.add(self.variance * (1 + np.sqrt(3.) * dist) * np.exp(-np.sqrt(3.) * dist), target, target) + + def Kdiag(self, X, target): + """Compute the diagonal of the covariance matrix associated to X.""" + np.add(target, self.variance, target) + + def dK_dtheta(self, dL_dK, X, X2, target): + """derivative of the covariance matrix with respect to the parameters.""" + if X2 is None: X2 = X + dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1)) + dvar = (1 + np.sqrt(3.) * dist) * np.exp(-np.sqrt(3.) * dist) + invdist = 1. / np.where(dist != 0., dist, np.inf) + dist2M = np.square(X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 3 + # dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis] + target[0] += np.sum(dvar * dL_dK) + if self.ARD == True: + dl = (self.variance * 3 * dist * np.exp(-np.sqrt(3.) * dist))[:, :, np.newaxis] * dist2M * invdist[:, :, np.newaxis] + # dl = self.variance*dvar[:,:,None]*dist2M*invdist[:,:,None] + target[1:] += (dl * dL_dK[:, :, None]).sum(0).sum(0) + else: + dl = (self.variance * 3 * dist * np.exp(-np.sqrt(3.) 
* dist)) * dist2M.sum(-1) * invdist + # dl = self.variance*dvar*dist2M.sum(-1)*invdist + target[1] += np.sum(dl * dL_dK) + + def dKdiag_dtheta(self, dL_dKdiag, X, target): + """derivative of the diagonal of the covariance matrix with respect to the parameters.""" + target[0] += np.sum(dL_dKdiag) + + def dK_dX(self, dL_dK, X, X2, target): + """derivative of the covariance matrix with respect to X.""" + if X2 is None: X2 = X + dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None] + ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf) + dK_dX = -np.transpose(3 * self.variance * dist * np.exp(-np.sqrt(3) * dist) * ddist_dX, (1, 0, 2)) + target += np.sum(dK_dX * dL_dK.T[:, :, None], 0) + + def dKdiag_dX(self, dL_dKdiag, X, target): + pass + + def Gram_matrix(self, F, F1, F2, lower, upper): + """ + Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1. + + :param F: vector of functions + :type F: np.array + :param F1: vector of derivatives of F + :type F1: np.array + :param F2: vector of second derivatives of F + :type F2: np.array + :param lower,upper: boundaries of the input domain + :type lower,upper: floats + """ + assert self.input_dim == 1 + def L(x, i): + return(3. / self.lengthscale ** 2 * F[i](x) + 2 * np.sqrt(3) / self.lengthscale * F1[i](x) + F2[i](x)) + n = F.shape[0] + G = np.zeros((n, n)) + for i in range(n): + for j in range(i, n): + G[i, j] = G[j, i] = integrate.quad(lambda x : L(x, i) * L(x, j), lower, upper)[0] + Flower = np.array([f(lower) for f in F])[:, None] + F1lower = np.array([f(lower) for f in F1])[:, None] + # print "OLD \n", np.dot(F1lower,F1lower.T), "\n \n" + # return(G) + return(self.lengthscale ** 3 / (12.*np.sqrt(3) * self.variance) * G + 1. 
/ self.variance * np.dot(Flower, Flower.T) + self.lengthscale ** 2 / (3.*self.variance) * np.dot(F1lower, F1lower.T)) diff --git a/GPy/kern/parts/Matern52.py b/GPy/kern/parts/Matern52.py new file mode 100644 index 00000000..e02cb9bf --- /dev/null +++ b/GPy/kern/parts/Matern52.py @@ -0,0 +1,142 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +from kernpart import Kernpart +import numpy as np +import hashlib +from scipy import integrate + +class Matern52(Kernpart): + """ + Matern 5/2 kernel: + + .. math:: + + k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac53 r^2) \exp(- \sqrt{5} r) \ \ \ \ \ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} } + + :param input_dim: the number of input dimensions + :type input_dim: int + :param variance: the variance :math:`\sigma^2` + :type variance: float + :param lengthscale: the vector of lengthscale :math:`\ell_i` + :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter) + :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension. 
+ :type ARD: Boolean + :rtype: kernel object + + """ + def __init__(self,input_dim,variance=1.,lengthscale=None,ARD=False): + self.input_dim = input_dim + self.ARD = ARD + if ARD == False: + self.num_params = 2 + self.name = 'Mat52' + if lengthscale is not None: + lengthscale = np.asarray(lengthscale) + assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel" + else: + lengthscale = np.ones(1) + else: + self.num_params = self.input_dim + 1 + self.name = 'Mat52' + if lengthscale is not None: + lengthscale = np.asarray(lengthscale) + assert lengthscale.size == self.input_dim, "bad number of lengthscales" + else: + lengthscale = np.ones(self.input_dim) + self._set_params(np.hstack((variance,lengthscale.flatten()))) + + def _get_params(self): + """return the value of the parameters.""" + return np.hstack((self.variance,self.lengthscale)) + + def _set_params(self,x): + """set the value of the parameters.""" + assert x.size == self.num_params + self.variance = x[0] + self.lengthscale = x[1:] + + def _get_param_names(self): + """return parameter names.""" + if self.num_params == 2: + return ['variance','lengthscale'] + else: + return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscale.size)] + + def K(self,X,X2,target): + """Compute the covariance matrix between X and X2.""" + if X2 is None: X2 = X + dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1)) + np.add(self.variance*(1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist), target,target) + + def Kdiag(self,X,target): + """Compute the diagonal of the covariance matrix associated to X.""" + np.add(target,self.variance,target) + + def dK_dtheta(self,dL_dK,X,X2,target): + """derivative of the covariance matrix with respect to the parameters.""" + if X2 is None: X2 = X + dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1)) + invdist = 1./np.where(dist!=0.,dist,np.inf) + dist2M = 
np.square(X[:,None,:]-X2[None,:,:])/self.lengthscale**3 + dvar = (1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist) + dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis] + target[0] += np.sum(dvar*dL_dK) + if self.ARD: + dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis] + #dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis] + target[1:] += (dl*dL_dK[:,:,None]).sum(0).sum(0) + else: + dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist)) * dist2M.sum(-1)*invdist + #dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist)) * dist2M.sum(-1)*invdist + target[1] += np.sum(dl*dL_dK) + + def dKdiag_dtheta(self,dL_dKdiag,X,target): + """derivative of the diagonal of the covariance matrix with respect to the parameters.""" + target[0] += np.sum(dL_dKdiag) + + def dK_dX(self,dL_dK,X,X2,target): + """derivative of the covariance matrix with respect to X.""" + if X2 is None: X2 = X + dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None] + ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf) + dK_dX = - np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2)) + target += np.sum(dK_dX*dL_dK.T[:,:,None],0) + + def dKdiag_dX(self,dL_dKdiag,X,target): + pass + + def Gram_matrix(self,F,F1,F2,F3,lower,upper): + """ + Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1. 
+ + :param F: vector of functions + :type F: np.array + :param F1: vector of derivatives of F + :type F1: np.array + :param F2: vector of second derivatives of F + :type F2: np.array + :param F3: vector of third derivatives of F + :type F3: np.array + :param lower,upper: boundaries of the input domain + :type lower,upper: floats + """ + assert self.input_dim == 1 + def L(x,i): + return(5*np.sqrt(5)/self.lengthscale**3*F[i](x) + 15./self.lengthscale**2*F1[i](x)+ 3*np.sqrt(5)/self.lengthscale*F2[i](x) + F3[i](x)) + n = F.shape[0] + G = np.zeros((n,n)) + for i in range(n): + for j in range(i,n): + G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0] + G_coef = 3.*self.lengthscale**5/(400*np.sqrt(5)) + Flower = np.array([f(lower) for f in F])[:,None] + F1lower = np.array([f(lower) for f in F1])[:,None] + F2lower = np.array([f(lower) for f in F2])[:,None] + orig = 9./8*np.dot(Flower,Flower.T) + 9.*self.lengthscale**4/200*np.dot(F2lower,F2lower.T) + orig2 = 3./5*self.lengthscale**2 * ( np.dot(F1lower,F1lower.T) + 1./8*np.dot(Flower,F2lower.T) + 1./8*np.dot(F2lower,Flower.T)) + return(1./self.variance* (G_coef*G + orig + orig2)) + + + diff --git a/GPy/kern/parts/__init__.py b/GPy/kern/parts/__init__.py new file mode 100644 index 00000000..053e280f --- /dev/null +++ b/GPy/kern/parts/__init__.py @@ -0,0 +1,26 @@ +import bias +import Brownian +import coregionalise +import exponential +import finite_dimensional +import fixed +import gibbs +import independent_outputs +import linear +import Matern32 +import Matern52 +import mlp +import periodic_exponential +import periodic_Matern32 +import periodic_Matern52 +import poly +import prod_orthogonal +import prod +import rational_quadratic +import rbfcos +import rbf +import spline +import symmetric +import white +import hierarchical +import rbf_inv diff --git a/GPy/kern/parts/bias.py b/GPy/kern/parts/bias.py new file mode 100644 index 00000000..2b72e7c9 --- /dev/null +++ b/GPy/kern/parts/bias.py @@ -0,0 +1,89 @@ 
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +from kernpart import Kernpart +import numpy as np +import hashlib + +class Bias(Kernpart): + def __init__(self,input_dim,variance=1.): + """ + :param input_dim: the number of input dimensions + :type input_dim: int + :param variance: the variance of the kernel + :type variance: float + """ + self.input_dim = input_dim + self.num_params = 1 + self.name = 'bias' + self._set_params(np.array([variance]).flatten()) + + def _get_params(self): + return self.variance + + def _set_params(self,x): + assert x.shape==(1,) + self.variance = x + + def _get_param_names(self): + return ['variance'] + + def K(self,X,X2,target): + target += self.variance + + def Kdiag(self,X,target): + target += self.variance + + def dK_dtheta(self,dL_dKdiag,X,X2,target): + target += dL_dKdiag.sum() + + def dKdiag_dtheta(self,dL_dKdiag,X,target): + target += dL_dKdiag.sum() + + def dK_dX(self, dL_dK,X, X2, target): + pass + + def dKdiag_dX(self,dL_dKdiag,X,target): + pass + + #---------------------------------------# + # PSI statistics # + #---------------------------------------# + + def psi0(self, Z, mu, S, target): + target += self.variance + + def psi1(self, Z, mu, S, target): + self._psi1 = self.variance + target += self._psi1 + + def psi2(self, Z, mu, S, target): + target += self.variance**2 + + def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S, target): + target += dL_dpsi0.sum() + + def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target): + target += dL_dpsi1.sum() + + def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target): + target += 2.*self.variance*dL_dpsi2.sum() + + def dpsi0_dZ(self, dL_dpsi0, Z, mu, S, target): + pass + + def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S): + pass + + def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target): + pass + + def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S): + pass + + def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target): + pass + + 
def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S): + pass diff --git a/GPy/kern/parts/coregionalise.py b/GPy/kern/parts/coregionalise.py new file mode 100644 index 00000000..94179088 --- /dev/null +++ b/GPy/kern/parts/coregionalise.py @@ -0,0 +1,160 @@ +# Copyright (c) 2012, James Hensman and Ricardo Andrade +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +from kernpart import Kernpart +import numpy as np +from GPy.util.linalg import mdot, pdinv +import pdb +from scipy import weave + +class Coregionalise(Kernpart): + """ + Coregionalisation kernel. + + Used for computing covariance functions of the form + .. math:: + k_2(x, y)=B k(x, y) + where + .. math:: + B = WW^\top + diag(kappa) + + :param output_dim: the number of output dimensions + :type output_dim: int + :param rank: the rank of the coregionalisation matrix. + :type rank: int + :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B. + :type W: ndarray + :param kappa: a diagonal term which allows the outputs to behave independently. + :rtype: kernel object + + .. Note: see coregionalisation examples in GPy.examples.regression for some usage. 
+ """ + def __init__(self,output_dim,rank=1, W=None, kappa=None): + self.input_dim = 1 + self.name = 'coregion' + self.output_dim = output_dim + self.rank = rank + if W is None: + self.W = 0.5*np.random.randn(self.output_dim,self.rank)/np.sqrt(self.rank) + else: + assert W.shape==(self.output_dim,self.rank) + self.W = W + if kappa is None: + kappa = 0.5*np.ones(self.output_dim) + else: + assert kappa.shape==(self.output_dim,) + self.kappa = kappa + self.num_params = self.output_dim*(self.rank + 1) + self._set_params(np.hstack([self.W.flatten(),self.kappa])) + + def _get_params(self): + return np.hstack([self.W.flatten(),self.kappa]) + + def _set_params(self,x): + assert x.size == self.num_params + self.kappa = x[-self.output_dim:] + self.W = x[:-self.output_dim].reshape(self.output_dim,self.rank) + self.B = np.dot(self.W,self.W.T) + np.diag(self.kappa) + + def _get_param_names(self): + return sum([['W%i_%i'%(i,j) for j in range(self.rank)] for i in range(self.output_dim)],[]) + ['kappa_%i'%i for i in range(self.output_dim)] + + def K(self,index,index2,target): + index = np.asarray(index,dtype=np.int) + + #here's the old code (numpy) + #if index2 is None: + #index2 = index + #else: + #index2 = np.asarray(index2,dtype=np.int) + #false_target = target.copy() + #ii,jj = np.meshgrid(index,index2) + #ii,jj = ii.T, jj.T + #false_target += self.B[ii,jj] + + if index2 is None: + code=""" + for(int i=0;i