Merge branch 'master' of github.com:SheffieldML/GPy

James Hensman 2012-12-17 11:17:10 +00:00
commit 3c708b3bd6
10 changed files with 387 additions and 26 deletions

.gitignore vendored

@@ -36,3 +36,6 @@ nosetests.xml
 #vim
 *.swp
+#bfgs optimiser leaves this lying around
+iterate.dat


@@ -227,8 +227,8 @@ class model(parameterised):
         start = self.extract_param()
         optimizer = optimization.get_optimizer(optimizer)
-        opt = optimizer(start, f_fp=f_fp, f=f, fp=fp, **kwargs)
-        opt.run()
+        opt = optimizer(start, model=self, **kwargs)
+        opt.run(f_fp=f_fp, f=f, fp=fp)
         self.optimization_runs.append(opt)
         self.expand_param(opt.x_opt)
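The change above moves the objective callables out of the optimizer constructor: the optimizer is now built with a reference to the model (model=self), and f, fp and f_fp are supplied to run() instead. For orientation, the three callables are related as follows (a generic sketch, not code from this commit):

# f(x)    -> scalar objective (for GPy, e.g. the negative log marginal likelihood)
# fp(x)   -> gradient of the objective with respect to the parameter vector x
# f_fp(x) -> (f(x), fp(x)) computed together, the form scipy's fmin_tnc and fmin_l_bfgs_b expect

def make_f_fp(f, fp):
    """Combine separate objective and gradient callables into the f_fp form (illustrative helper)."""
    return lambda x: (f(x), fp(x))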


@@ -19,15 +19,12 @@ class Optimizer():
     :param messages: print messages from the optimizer?
     :type messages: (True | False)
     :param max_f_eval: maximum number of function evaluations
     :rtype: optimizer object.
     """
-    def __init__(self, x_init, f_fp, f, fp, messages=False, max_f_eval=1e4, ftol=None, gtol=None, xtol=None):
+    def __init__(self, x_init, messages=False, model=None, max_f_eval=1e4, ftol=None, gtol=None, xtol=None):
         self.opt_name = None
-        self.f_fp = f_fp
-        self.f = f
-        self.fp = fp
         self.x_init = x_init
         self.messages = messages
         self.f_opt = None
@@ -40,14 +37,15 @@ class Optimizer():
         self.xtol = xtol
         self.gtol = gtol
         self.ftol = ftol
+        self.model = model

-    def run(self):
+    def run(self, **kwargs):
         start = dt.datetime.now()
-        self.opt()
+        self.opt(**kwargs)
         end = dt.datetime.now()
         self.time = str(end-start)

-    def opt(self):
+    def opt(self, f_fp=None, f=None, fp=None):
         raise NotImplementedError, "this needs to be implemented to use the optimizer class"

     def plot(self):
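With the new interface, a custom optimizer only has to override opt() and accept the callables as keyword arguments, mirroring the scipy wrappers below. A sketch of what such a subclass could look like (the gradient-descent logic, fixed step size and status string are invented for illustration; only the Optimizer interface comes from this diff, and the notes file at the end of this commit asks for a gradient-descent optimizer of roughly this shape):

class opt_gradient_descent(Optimizer):
    """Toy gradient-descent optimizer written against the new opt() signature (illustrative only)."""
    def __init__(self, *args, **kwargs):
        Optimizer.__init__(self, *args, **kwargs)
        self.opt_name = "naive gradient descent (illustration)"

    def opt(self, f_fp=None, f=None, fp=None):
        assert f_fp is not None, "gradient descent requires f_fp"
        x = self.x_init.copy()
        step = 1e-3  # assumed fixed step size
        for i in range(int(self.max_f_eval)):
            obj, grad = f_fp(x)
            x = x - step * grad
        self.x_opt = x
        self.f_opt = f_fp(x)[0]
        self.funct_eval = int(self.max_f_eval)
        self.status = 'maximum number of function evaluations reached'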
@@ -71,7 +69,7 @@ class opt_tnc(Optimizer):
         Optimizer.__init__(self, *args, **kwargs)
         self.opt_name = "TNC (Scipy implementation)"

-    def opt(self):
+    def opt(self, f_fp=None, f=None, fp=None):
         """
         Run the TNC optimizer
@@ -79,7 +77,7 @@ class opt_tnc(Optimizer):
         tnc_rcstrings = ['Local minimum', 'Converged', 'XConverged', 'Maximum number of f evaluations reached',
                          'Line search failed', 'Function is constant']

-        assert self.f_fp != None, "TNC requires f_fp"
+        assert f_fp != None, "TNC requires f_fp"

         opt_dict = {}
         if self.xtol is not None:
@@ -89,10 +87,10 @@ class opt_tnc(Optimizer):
         if self.gtol is not None:
             opt_dict['pgtol'] = self.gtol

-        opt_result = optimize.fmin_tnc(self.f_fp, self.x_init, messages = self.messages,
+        opt_result = optimize.fmin_tnc(f_fp, self.x_init, messages = self.messages,
                                        maxfun = self.max_f_eval, **opt_dict)
         self.x_opt = opt_result[0]
-        self.f_opt = self.f_fp(self.x_opt)[0]
+        self.f_opt = f_fp(self.x_opt)[0]
         self.funct_eval = opt_result[1]
         self.status = tnc_rcstrings[opt_result[2]]
@@ -101,14 +99,14 @@ class opt_lbfgsb(Optimizer):
         Optimizer.__init__(self, *args, **kwargs)
         self.opt_name = "L-BFGS-B (Scipy implementation)"

-    def opt(self):
+    def opt(self, f_fp=None, f=None, fp=None):
         """
         Run the optimizer
         """
         rcstrings = ['Converged', 'Maximum number of f evaluations reached', 'Error']

-        assert self.f_fp != None, "BFGS requires f_fp"
+        assert f_fp != None, "BFGS requires f_fp"

         if self.messages:
             iprint = 1
@@ -123,10 +121,10 @@ class opt_lbfgsb(Optimizer):
         if self.gtol is not None:
             opt_dict['pgtol'] = self.gtol

-        opt_result = optimize.fmin_l_bfgs_b(self.f_fp, self.x_init, iprint = iprint,
+        opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint = iprint,
                                             maxfun = self.max_f_eval, **opt_dict)
         self.x_opt = opt_result[0]
-        self.f_opt = self.f_fp(self.x_opt)[0]
+        self.f_opt = f_fp(self.x_opt)[0]
         self.funct_eval = opt_result[2]['funcalls']
         self.status = rcstrings[opt_result[2]['warnflag']]
@@ -135,7 +133,7 @@ class opt_simplex(Optimizer):
         Optimizer.__init__(self, *args, **kwargs)
         self.opt_name = "Nelder-Mead simplex routine (via Scipy)"

-    def opt(self):
+    def opt(self, f_fp=None, f=None, fp=None):
         """
         The simplex optimizer does not require gradients.
         """
@@ -150,7 +148,7 @@ class opt_simplex(Optimizer):
         if self.gtol is not None:
             print "WARNING: simplex doesn't have an gtol arg, so I'm going to ignore it"

-        opt_result = optimize.fmin(self.f, self.x_init, (), disp = self.messages,
+        opt_result = optimize.fmin(f, self.x_init, (), disp = self.messages,
                                    maxfun = self.max_f_eval, full_output=True, **opt_dict)
         self.x_opt = opt_result[0]


@@ -2,5 +2,5 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, rbf_ARD, spline, Brownian, linear_ARD
+from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, rbf_ARD, spline, Brownian, linear_ARD, rbf_sympy
 from kern import kern


@@ -21,6 +21,7 @@ from Brownian import Brownian as Brownianpart
 #TODO these constructors are not as clean as we'd like. Tidy the code up
 #using meta-classes to make the objects construct properly without them.

 def rbf(D, variance=1., lengthscale=1.):
     """
     Construct an RBF kernel
@@ -170,3 +171,20 @@ def Brownian(D,variance=1.):
     """
     part = Brownianpart(D,variance)
     return kern(D, [part])
+
+import sympy as sp
+from sympykern import spkern
+from sympy.parsing.sympy_parser import parse_expr
+
+def rbf_sympy(D, variance=1., lengthscale=1.):
+    """
+    Radial Basis Function covariance.
+    """
+    X = [sp.var('x%i'%i) for i in range(D)]
+    Z = [sp.var('z%i'%i) for i in range(D)]
+    rbf_variance = sp.var('rbf_variance', positive=True)
+    rbf_lengthscale = sp.var('rbf_lengthscale', positive=True)
+    dist_string = ' + '.join(['(x%i-z%i)**2'%(i,i) for i in range(D)])
+    dist = parse_expr(dist_string)
+    f = rbf_variance*sp.exp(-dist/(2*rbf_lengthscale**2))
+    return kern(D, [spkern(D, f, np.array([variance, lengthscale]))])
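The rbf_sympy constructor shows the general recipe for sympy-backed kernels: declare x%i / z%i symbols, declare the hyperparameters as sympy variables, build the covariance expression, and wrap it in spkern inside a kern. As a sketch of how another kernel could be written the same way (this linear_sympy helper is not part of the commit, purely an illustration of the pattern):

def linear_sympy(D, variance=1.):
    """
    Linear covariance k(x,z) = variance * x'z, built through sympy (illustrative).
    """
    X = [sp.var('x%i'%i) for i in range(D)]
    Z = [sp.var('z%i'%i) for i in range(D)]
    lin_variance = sp.var('lin_variance', positive=True)
    dot_string = ' + '.join(['x%i*z%i'%(i,i) for i in range(D)])
    f = lin_variance*parse_expr(dot_string)
    return kern(D, [spkern(D, f, np.array([variance]))])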

GPy/kern/sympy_helpers.cpp Normal file

@@ -0,0 +1,10 @@
#include <math.h>

double DiracDelta(double x){
    if((x<0.000001) & (x>-0.000001)) //go on, laugh at my c++ skills
        return 1.0;
    else
        return 0.0;
};

double DiracDelta(double x, int foo){
    return 0.0;
};

GPy/kern/sympy_helpers.h Normal file

@@ -0,0 +1,3 @@
#include <math.h>
double DiracDelta(double x);
double DiracDelta(double x, int foo);

GPy/kern/sympykern.py Normal file

@@ -0,0 +1,286 @@
import numpy as np
import sympy as sp
from sympy.utilities.codegen import codegen
from sympy.core.cache import clear_cache
from scipy import weave
import re
import os
import sys
current_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
import tempfile
import pdb

from kernpart import kernpart

class spkern(kernpart):
    """
    A kernel object, where all the hard work is done by sympy.

    :param k: the covariance function
    :type k: a positive definite sympy function of x1, z1, x2, z2...

    To construct a new sympy kernel, you'll need to define:
      - a kernel function using a sympy object. Ensure that the kernel is of the form k(x,z).
      - that's it! we'll extract the variables from the function k.

    Note:
      - to handle multiple inputs, call them x1, z1, etc.
      - to handle multiple correlated outputs, you'll need to define each covariance function and 'cross' covariance function. TODO
    """
    def __init__(self, D, k, param=None):
        self.name = 'sympykern'
        self._sp_k = k
        sp_vars = [e for e in k.atoms() if e.is_Symbol]
        self._sp_x = sorted([e for e in sp_vars if e.name[0]=='x'], key=lambda x: int(x.name[1:]))
        self._sp_z = sorted([e for e in sp_vars if e.name[0]=='z'], key=lambda z: int(z.name[1:]))
        assert all([x.name=='x%i'%i for i,x in enumerate(self._sp_x)])
        assert all([z.name=='z%i'%i for i,z in enumerate(self._sp_z)])
        assert len(self._sp_x)==len(self._sp_z)
        self.D = len(self._sp_x)
        assert self.D == D
        self._sp_theta = sorted([e for e in sp_vars if not (e.name[0]=='x' or e.name[0]=='z')], key=lambda e: e.name)
        self.Nparam = len(self._sp_theta)

        #deal with param
        if param is None:
            param = np.ones(self.Nparam)
        assert param.size==self.Nparam
        self.set_param(param)

        #Differentiate!
        self._sp_dk_dtheta = [sp.diff(k,theta) for theta in self._sp_theta]
        self._sp_dk_dx = [sp.diff(k,xi) for xi in self._sp_x]
        #self._sp_dk_dz = [sp.diff(k,zi) for zi in self._sp_z]

        #self.compute_psi_stats()
        self._gen_code()

        self.weave_kwargs = {
            'support_code': self._function_code,
            'include_dirs': [tempfile.gettempdir(), os.path.join(current_dir, 'kern/')],
            'headers': ['"sympy_helpers.h"'],
            'sources': [os.path.join(current_dir, "kern/sympy_helpers.cpp")],
            #'extra_compile_args': ['-ftree-vectorize', '-mssse3', '-ftree-vectorizer-verbose=5'],
            'extra_compile_args': [],
            'extra_link_args': ['-lgomp'],
            'verbose': True}

    def __add__(self, other):
        return spkern(self._sp_k + other._sp_k)
    def compute_psi_stats(self):
        #define some normal distributions
        mus = [sp.var('mu%i'%i, real=True) for i in range(self.D)]
        Ss = [sp.var('S%i'%i, positive=True) for i in range(self.D)]
        normals = [(2*sp.pi*Si)**(-0.5)*sp.exp(-0.5*(xi-mui)**2/Si) for xi, mui, Si in zip(self._sp_x, mus, Ss)]

        #do some integration!
        #self._sp_psi0 = ??
        self._sp_psi1 = self._sp_k
        for i in range(self.D):
            print 'performing integrals %i of %i'%(i+1, 2*self.D)
            sys.stdout.flush()
            self._sp_psi1 *= normals[i]
            self._sp_psi1 = sp.integrate(self._sp_psi1, (self._sp_x[i], -sp.oo, sp.oo))
            clear_cache()
        self._sp_psi1 = self._sp_psi1.simplify()

        #and here's psi2 (eek!)
        zprime = [sp.Symbol('zp%i'%i) for i in range(self.D)]
        self._sp_psi2 = self._sp_k.copy()*self._sp_k.copy().subs(zip(self._sp_z, zprime))
        for i in range(self.D):
            print 'performing integrals %i of %i'%(self.D+i+1, 2*self.D)
            sys.stdout.flush()
            self._sp_psi2 *= normals[i]
            self._sp_psi2 = sp.integrate(self._sp_psi2, (self._sp_x[i], -sp.oo, sp.oo))
            clear_cache()
        self._sp_psi2 = self._sp_psi2.simplify()
    def _gen_code(self):
        #generate c functions from sympy objects
        (foo_c, self._function_code), (foo_h, self._function_header) = \
            codegen([('k', self._sp_k)] \
                + [('dk_d%s'%x.name, dx) for x, dx in zip(self._sp_x, self._sp_dk_dx)]\
                #+ [('dk_d%s'%z.name, dz) for z, dz in zip(self._sp_z, self._sp_dk_dz)]\
                + [('dk_d%s'%theta.name, dtheta) for theta, dtheta in zip(self._sp_theta, self._sp_dk_dtheta)]\
                , "C", 'foobar', argument_sequence=self._sp_x+self._sp_z+self._sp_theta)

        #put the header file where we can find it
        f = file(os.path.join(tempfile.gettempdir(), 'foobar.h'), 'w')
        f.write(self._function_header)
        f.close()

        #get rid of derivatives of DiracDelta
        self._function_code = re.sub('DiracDelta\(.+?,.+?\)', '0.0', self._function_code)

        #Here's some code to do the looping for K
        arglist = ", ".join(["X[i*D+%s]"%x.name[1:] for x in self._sp_x]\
                + ["Z[j*D+%s]"%z.name[1:] for z in self._sp_z]\
                + ["param[%i]"%i for i in range(self.Nparam)])
        self._K_code =\
        """
        int i;
        int j;
        int N = target_array->dimensions[0];
        int M = target_array->dimensions[1];
        int D = X_array->dimensions[1];
        #pragma omp parallel for private(j)
        for (i=0;i<N;i++){
            for (j=0;j<M;j++){
                target[i*M+j] = k(%s);
            }
        }
        %s
        """%(arglist, "/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed

        diag_arglist = re.sub('Z', 'X', arglist)
        diag_arglist = re.sub('j', 'i', diag_arglist)
        #Here's some code to do the looping for Kdiag
        self._Kdiag_code =\
        """
        int i;
        int N = target_array->dimensions[0];
        int D = X_array->dimensions[1];
        #pragma omp parallel for
        for (i=0;i<N;i++){
            target[i] = k(%s);
        }
        %s
        """%(diag_arglist, "/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed

        #here's some code to compute gradients
        funclist = '\n'.join([' '*16 + 'target[%i] += partial[i*M+j]*dk_d%s(%s);'%(i, theta.name, arglist) for i, theta in enumerate(self._sp_theta)])
        self._dK_dtheta_code =\
        """
        int i;
        int j;
        int N = partial_array->dimensions[0];
        int M = partial_array->dimensions[1];
        int D = X_array->dimensions[1];
        #pragma omp parallel for private(j)
        for (i=0;i<N;i++){
            for (j=0;j<M;j++){
                %s
            }
        }
        %s
        """%(funclist, "/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed

        #here's some code to compute gradients for Kdiag TODO: this is yucky.
        diag_funclist = re.sub('Z', 'X', funclist, count=0)
        diag_funclist = re.sub('j', 'i', diag_funclist)
        diag_funclist = re.sub('partial\[i\*M\+i\]', 'partial[i]', diag_funclist)
        self._dKdiag_dtheta_code =\
        """
        int i;
        int N = partial_array->dimensions[0];
        int D = X_array->dimensions[1];
        for (i=0;i<N;i++){
            %s
        }
        %s
        """%(diag_funclist, "/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed

        #Here's some code to do gradients wrt x
        gradient_funcs = "\n".join(["target[i*D+%i] += partial[i*M+j]*dk_dx%i(%s);"%(q, q, arglist) for q in range(self.D)])
        self._dK_dX_code = \
        """
        int i;
        int j;
        int N = partial_array->dimensions[0];
        int M = partial_array->dimensions[1];
        int D = X_array->dimensions[1];
        #pragma omp parallel for private(j)
        for (i=0;i<N; i++){
            for (j=0; j<M; j++){
                %s
            }
        }
        %s
        """%(gradient_funcs, "/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed

        #Here's some code to do gradients wrt z (should be the same as for X, but this is easier)
        gradient_funcs_Z = "\n".join(["target[j*D+%i] += partial[i*M+j]*dk_dz%i(%s);"%(q, q, arglist) for q in range(self.D)])
        self._dK_dZ_code = \
        """
        int i;
        int j;
        int N = partial_array->dimensions[0];
        int M = partial_array->dimensions[1];
        int D = X_array->dimensions[1];
        for (i=0;i<N; i++){
            for (j=0; j<M; j++){
                %s
            }
        }
        %s
        """%(gradient_funcs_Z, "/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed

        #now for gradients of Kdiag wrt X
        self._dKdiag_dX_code = \
        """
        int i;
        int j;
        int N = partial_array->dimensions[0];
        int M = 0;
        int D = X_array->dimensions[1];
        for (i=0;i<N; i++){
            j = i;
            %s
        }
        %s
        """%(gradient_funcs, "/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed

        #TODO: insert multiple functions here via string manipulation
        #TODO: similar functions for psi_stats
    def K(self, X, Z, target):
        param = self._param
        weave.inline(self._K_code, arg_names=['target','X','Z','param'], **self.weave_kwargs)
        return target

    def Kdiag(self, X, target):
        param = self._param
        weave.inline(self._Kdiag_code, arg_names=['target','X','param'], **self.weave_kwargs)
        return target

    def dK_dtheta(self, partial, X, Z, target):
        param = self._param
        weave.inline(self._dK_dtheta_code, arg_names=['target','X','Z','param','partial'], **self.weave_kwargs)
        return target

    def dKdiag_dtheta(self, partial, X, target):
        param = self._param
        Z = X
        weave.inline(self._dKdiag_dtheta_code, arg_names=['target','X','Z','param','partial'], **self.weave_kwargs)
        return target

    def dK_dX(self, partial, X, Z, target):
        target = np.zeros_like(X)
        param = self._param
        weave.inline(self._dK_dX_code, arg_names=['target','X','Z','param','partial'], **self.weave_kwargs)
        return target

    #def dK_dZ(self, X, Z, partial=None):
        ##TODO: this function might not be necessary
        #target = np.zeros_like(Z)
        #param = self._param
        #weave.inline(self._dK_dZ_code, arg_names=['target','X','Z','param','partial'], **self.weave_kwargs)
        #return target

    def dKdiag_dX(self, partial, X, target):
        param = self._param
        Z = X
        weave.inline(self._dKdiag_dX_code, arg_names=['target','X','Z','param','partial'], **self.weave_kwargs)
        return target

    def set_param(self, param):
        #print param.flags['C_CONTIGUOUS']
        self._param = param.copy()

    def get_param(self):
        return self._param

    def get_param_names(self):
        return [x.name for x in self._sp_theta]
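For orientation, this is roughly how the new sympy kernel is meant to be exercised end to end; a sketch that assumes a working scipy.weave / C compiler setup and uses the rbf_sympy constructor added above (the kern.K(X, X) call pattern is the one described in the notes file below):

import numpy as np
import GPy

X = np.random.randn(10, 2)                                # 10 points in 2 input dimensions
k = GPy.kern.rbf_sympy(2, variance=1.0, lengthscale=0.5)

# the kern wrapper allocates the target arrays and loops over its parts;
# the heavy lifting is the weave-compiled k(...) generated by _gen_code()
K = k.K(X, X)                                             # 10x10 covariance matrix, compiled on first call
print(K.shape)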


@@ -22,8 +22,9 @@ class GPLVM(GP_regression):
     :type init: 'PCA'|'random'
     """
-    def __init__(self, Y, Q, init='PCA', **kwargs):
-        X = self.initialise_latent(init, Q, Y)
+    def __init__(self, Y, Q, init='PCA', X=None, **kwargs):
+        if X is None:
+            X = self.initialise_latent(init, Q, Y)
         GP_regression.__init__(self, X, Y, **kwargs)

     def initialise_latent(self, init, Q, Y):
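With the extra keyword argument, a user can now pass their own latent initialisation instead of relying on PCA or random starts. A small usage sketch (the data shapes are made up, and the import of GPLVM from GPy's models is assumed):

import numpy as np

Y = np.random.randn(50, 12)         # 50 observations, 12 output dimensions
Q = 2                               # latent dimensionality
X0 = np.random.randn(50, Q) * 0.1   # user-supplied latent initialisation

# previously X was always computed by initialise_latent(init, Q, Y);
# now it is only computed when X is not given:
m = GPLVM(Y, Q, X=X0)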

GPy/notes.txt Normal file

@@ -0,0 +1,42 @@
Fails in weird ways if you pass an integer as the input instead of a double to the kernel.
The Matern kernels (at least the 52) still work in the ARD manner, which means they wouldn't run for very large input dimensions. Needs to be fixed to match the RBF.
Implementing new covariances is too complicated at the moment. We need a barebones example of what to implement and where. Commenting in the covariance matrices needs to be improved. It's not clear to a user what all the psi parts are for. Maybe we need a cut down and simplified example to help with this (perhaps a cut down version of the RBF?). And then we should provide a simple list of what you need to do to get a new kernel going.
Missing kernels: polynomial, rational quadratic.
Kernel implementations are far too obscure. They need to be easily readable for a first-time user.
Need an implementation of scaled conjugate gradients for the optimizers.
Need an implementation of gradient descent for the optimizers (works well with GP-LVM for small random initializations).
Need Carl Rasmussen's permission to add his conjugate gradients algorithm. In fact, we can just provide a hook for it, and post a separate python implementation of his algorithm.
Change get_param and set_param to get_params and set_params
Get constrain param by default inside model creation.
Randomize doesn't seem to cover a wide enough range for restarts ... try it for a model where inputs are widely spaced apart and length scale is too short. Sampling from N(0,1) is too conservative. Dangerous for people who naively use restarts. Since we have the model we could maybe come up with some sensible heuristics for setting these things. Maybe we should also consider having '.initialize()'. If we can't do this well we should disable the restart method.
Tolerances for optimizers: do we need to introduce some standardization? At the moment, does each have its own defaults?
Do all optimizers work only in terms of function evaluations? Do we need to check for one that uses iterations?
Change Youter to YYT (Youter doesn't mean anything for matrices).
Bug when running classification.crescent_data()
A dictionary for parameter storage? So we can go through names easily?
A flag on covariance functions that indicates when they are not associated with an underlying function (like white noise or a coregionalization matrix).
Diagonal noise covariance function
Long term: automatic Lagrange multiplier calculation for optimizers: constrain two parameters in an unusual way and the model automatically does the Lagrangian. Also augment the parameters with new ones, so define data variance to be white noise plus RBF variance and optimize over that and signal to noise ratio ... for example constrain the sum of variances to equal the known variance of the data.
When computing kernel.K for kernels like rbf, you can't call rbf.K(X); you have to call rbf.K(X, X).
The predict method for GP_regression returns a covariance matrix, which is a bad idea as this takes a lot to compute; it's also confusing for first-time users. It should only be returned if the user explicitly requests it.