add the regression benchmark

This commit is contained in:
Zhenwen Dai 2015-08-27 17:04:52 +01:00
parent f221a3b1fa
commit 60aa865631
7 changed files with 310 additions and 2 deletions

View file

@ -5,3 +5,4 @@ Nicolas Durrande
Alan Saul
Max Zwiessele
Neil D. Lawrence
Zhenwen Dai

View file

@ -4,6 +4,7 @@
import numpy as np
from ...core import Model
from ...core.parameterization import variational
from GPy.core.parameterization.variational import VariationalPosterior
def infer_newX(model, Y_new, optimize=True, init='L2'):
"""
@ -60,7 +61,8 @@ class InferenceX(Model):
# self.kern.GPU(True)
from copy import deepcopy
self.posterior = deepcopy(model.posterior)
if hasattr(model, 'variational_prior'):
from ...core.parameterization.variational import VariationalPosterior
if isinstance(model.X, VariationalPosterior):
self.uncertain_input = True
from ...models.ss_gplvm import IBPPrior
from ...models.ss_mrd import IBPPrior_SSMRD
@ -71,7 +73,7 @@ class InferenceX(Model):
self.variational_prior = model.variational_prior.copy()
else:
self.uncertain_input = False
if hasattr(model, 'inducing_inputs'):
if hasattr(model, 'Z'):
self.sparse_gp = True
self.Z = model.Z.copy()
else:

View file

@ -0,0 +1,21 @@
# Copyright (c) 2015, Zhenwen Dai
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import abc
import numpy as np
class Evaluation(object):
    """Interface for a metric that scores predictions against ground truth.

    Concrete metrics override :meth:`evaluate`.
    """
    # NOTE: the ``__metaclass__`` attribute is only honoured by Python 2.
    # Python 3 ignores it, so abstractness is not enforced there.
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def evaluate(self, gt, pred):
        """Compute a scalar that assesses the performance.

        :param gt: the ground-truth values
        :param pred: the predicted values
        :returns: ``None`` in this base implementation
        """
        return
class RMSE(Evaluation):
    """Root mean square error between ground truth and predictions."""
    name = 'RMSE'

    def evaluate(self, gt, pred):
        """Return ``sqrt(mean((gt - pred) ** 2))``.

        :param gt: the ground-truth values
        :param pred: the predicted values; must support ``gt - pred``
        :returns: the root of the mean of the squared differences, taken
            over all elements
        """
        # The builtin ``float`` is what the ``np.float`` alias used to point
        # to; the alias was removed in NumPy 1.24 and now raises
        # AttributeError.
        squared_error = np.square(gt - pred).astype(float)
        return np.sqrt(squared_error.mean())

View file

@ -0,0 +1,81 @@
# Copyright (c) 2015, Zhenwen Dai
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import abc
import numpy as np
import GPy
class RegressionMethod(object):
    """Base class of the regression methods run by the benchmark.

    Subclasses implement :meth:`_fit` and :meth:`_predict`. Callers use
    :meth:`fit` and :meth:`predict`, which wrap those hooks with a
    zero-mean, unit-variance normalization of inputs and labels whenever
    ``self.preprocess`` is true.
    """
    # NOTE: the ``__metaclass__`` attribute is only honoured by Python 2.
    # Python 3 ignores it, so abstractness is not enforced there.
    __metaclass__ = abc.ABCMeta

    def __init__(self):
        # Switches the normalization in fit()/predict() on or off.
        self.preprocess = True

    @staticmethod
    def _nonzero_std(values):
        """Return ``values.std(axis=0)`` with exact zeros replaced by one.

        A constant column has zero standard deviation; dividing by it would
        turn the whole column into NaN. Using a scale of one leaves such a
        column at zero after centring.
        """
        std = values.std(axis=0)
        return np.where(std == 0, 1., std)

    def _preprocess(self, data, train):
        """Zero-mean, unit-variance normalization by default.

        :param data: an ``(inputs, labels)`` pair when ``train`` is true,
            otherwise the inputs alone
        :param train: if true, the statistics are computed from ``data`` and
            stored on ``self``; if false, the stored input statistics are
            reused
        :returns: the normalized ``(inputs, labels)`` pair when ``train`` is
            true, otherwise the normalized inputs
        """
        if train:
            inputs, labels = data
            self.data_mean = inputs.mean(axis=0)
            self.data_std = self._nonzero_std(inputs)
            self.labels_mean = labels.mean(axis=0)
            self.labels_std = self._nonzero_std(labels)
            return ((inputs-self.data_mean)/self.data_std,
                    (labels-self.labels_mean)/self.labels_std)
        return (data-self.data_mean)/self.data_std

    def _reverse_trans_labels(self, labels):
        """Map normalized labels back to the original label scale."""
        return labels*self.labels_std+self.labels_mean

    def fit(self, train_data):
        """Fit on ``train_data`` (an ``(inputs, labels)`` pair).

        :returns: whatever :meth:`_fit` returns
        """
        if self.preprocess:
            train_data = self._preprocess(train_data, True)
        return self._fit(train_data)

    def predict(self, test_data):
        """Predict labels for ``test_data`` in the original label scale.

        When preprocessing is on, the inputs are normalized with the
        statistics stored by :meth:`fit`, and the result of :meth:`_predict`
        is transformed back with the stored label statistics.
        """
        if self.preprocess:
            test_data = self._preprocess(test_data, False)
        labels = self._predict(test_data)
        if self.preprocess:
            labels = self._reverse_trans_labels(labels)
        return labels

    @abc.abstractmethod
    def _fit(self, train_data):
        """Fit the model. Return True if successful"""
        return True

    @abc.abstractmethod
    def _predict(self, test_data):
        """Predict on test data"""
        return None
class GP_RBF(RegressionMethod):
    """GP regression with a summed RBF + Linear + White kernel."""
    name = 'GP_RBF'

    def _fit(self, train_data):
        """Build and optimize a ``GPy.models.GPRegression`` model.

        :param train_data: an ``(inputs, labels)`` pair
        :returns: True
        """
        X, Y = train_data
        # Kernel: ARD RBF plus ARD linear plus a White kernel built with 0.01
        # as its second argument (presumably its variance -- confirm in GPy).
        rbf = GPy.kern.RBF(X.shape[-1], ARD=True)
        linear = GPy.kern.Linear(X.shape[1], ARD=True)
        white = GPy.kern.White(X.shape[1], 0.01)
        self.model = GPy.models.GPRegression(X, Y, kernel=rbf + linear + white)
        # Start the likelihood variance at 1% of the label variance.
        self.model.likelihood.variance[:] = Y.var()*0.01
        self.model.optimize()
        return True

    def _predict(self, test_data):
        """Return the first element of ``self.model.predict(test_data)``."""
        prediction = self.model.predict(test_data)
        return prediction[0]
class SVIGP_RBF(RegressionMethod):
    """``GPy.core.SVGP`` model with a Student-t likelihood."""
    name = 'SVIGP_RBF'

    def _fit(self, train_data):
        """Build the SVGP model and optimize it in two stages.

        :param train_data: an ``(inputs, labels)`` pair
        :returns: True
        """
        X, Y = train_data
        # Z is made of at most 100 rows of X, picked by a random permutation.
        shuffled_rows = np.random.permutation(X.shape[0])
        Z = X[shuffled_rows[:100]]
        rbf = GPy.kern.RBF(X.shape[1], ARD=True)
        linear = GPy.kern.Linear(X.shape[1], ARD=True)
        white = GPy.kern.White(X.shape[1], 0.01)
        k = rbf + linear + white
        lik = GPy.likelihoods.StudentT(deg_free=3.)
        self.model = GPy.core.SVGP(X, Y, Z=Z, kernel=k, likelihood=lik)
        # Stage 1: ten rounds of 'scg' with at most 40 iterations each.
        for _ in range(10):
            self.model.optimize('scg', max_iters=40, gtol=0, messages=0, xtol=0, ftol=0)
        # Stage 2: a single 'bfgs' run with at most 1000 iterations.
        self.model.optimize('bfgs', max_iters=1000, gtol=0, messages=0)
        return True

    def _predict(self, test_data):
        """Return the first element of ``self.model.predict(test_data)``."""
        prediction = self.model.predict(test_data)
        return prediction[0]

View file

@ -0,0 +1,64 @@
# Copyright (c) 2015, Zhenwen Dai
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from __future__ import print_function
import abc
import os
import numpy as np
class Output(object):
    """Interface for a sink that reports the benchmark results."""
    # NOTE: the ``__metaclass__`` attribute is only honoured by Python 2.
    # Python 3 ignores it, so abstractness is not enforced there.
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def output(self, config, results):
        """Report ``results`` for the benchmark described by ``config``.

        :param config: the benchmark configuration dictionary
        :param results: the array of collected results
        :returns: ``None`` in this base implementation
        """
        return
class ScreenOutput(Output):
    """Print the benchmark report to standard output as a tab-separated table."""

    def output(self, config, results):
        """Print one header row, then one row per task.

        Every cell is ``mean(std)`` of ``results[task, method, column]``,
        where the columns are the configured evaluations followed by a
        trailing ``time`` column.
        """
        print('='*10+'Report'+'='*10)
        column_names = [metric.name for metric in config['evaluations']]+['time']
        header = [' ']
        for method in config['methods']:
            for column in column_names:
                header.append(method.name+'('+column+')')
        print('\t'.join(header))

        n_methods = len(config['methods'])
        n_columns = len(config['evaluations'])+1
        for task_i, task in enumerate(config['tasks']):
            # Row label first, without a newline, then the cells on the same line.
            print(task.name+'\t', end='')
            cells = []
            for method_i in range(n_methods):
                for column_i in range(n_columns):
                    samples = results[task_i, method_i, column_i]
                    cells.append('%e(%e)'%(samples.mean(), samples.std()))
            print('\t'.join(cells))
class CSVOutput(Output):
    """Write the benchmark report to ``<outpath>/<prjname>.csv``."""

    def __init__(self, outpath, prjname):
        """Store the path of the CSV file to write.

        :param outpath: directory in which the file is created
        :param prjname: file name without the ``.csv`` extension
        """
        self.fname = os.path.join(outpath, prjname+'.csv')

    def output(self, config, results):
        """Write one header row, then one row per task.

        Every cell is ``mean (std)`` of ``results[task, method, column]``,
        where the columns are the configured evaluations followed by a
        trailing ``time`` column. An existing file is overwritten.
        """
        # Build the header before opening the file, so a malformed config
        # does not leave a truncated file behind.
        column_names = [a.name for a in config['evaluations']]+['time']
        header = [' ']+[m.name+'('+e+')' for m in config['methods'] for e in column_names]
        n_methods = len(config['methods'])
        n_columns = len(config['evaluations'])+1

        # The ``with`` block closes the file; no explicit close() is needed.
        with open(self.fname, 'w') as f:
            f.write(','.join(header)+'\n')
            for task_i, task in enumerate(config['tasks']):
                cells = []
                for method_i in range(n_methods):
                    for column_i in range(n_columns):
                        samples = results[task_i, method_i, column_i]
                        cells.append('%e (%e)'%(samples.mean(), samples.std()))
                f.write(task.name+','+','.join(cells)+'\n')
class H5Output(Output):
    """Store the raw results array in ``<outpath>/<prjname>.h5``."""

    def __init__(self, outpath, prjname):
        """Store the path of the HDF5 file to write.

        :param outpath: directory in which the file is created
        :param prjname: file name without the ``.h5`` extension
        """
        self.fname = os.path.join(outpath, prjname+'.h5')

    def output(self, config, results):
        """Write ``results`` into a dataset named ``'results'``.

        :param config: unused; kept for the common ``Output`` interface
        :param results: the array to store
        :raises RuntimeError: if ``h5py`` cannot be imported or the write
            fails; the message includes the underlying error
        """
        try:
            # Imported lazily so that h5py is only required for this output.
            import h5py
            # The context manager closes the file even if the write fails.
            with h5py.File(self.fname, 'w') as f:
                dataset = f.create_dataset('results', results.shape, dtype=results.dtype)
                dataset[:] = results
        except Exception as err:
            # Raising a plain string is itself a TypeError, which hid the
            # real cause; raise a proper exception that carries it.
            raise RuntimeError('Fails to write the parameters into a HDF5 file! (%s)' % (err,))

View file

@ -0,0 +1,53 @@
# Copyright (c) 2015, Zhenwen Dai
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from __future__ import print_function
from evaluation import RMSE
from methods import GP_RBF, SVIGP_RBF
from tasks import Housing, WineQuality
from outputs import ScreenOutput, CSVOutput, H5Output
import numpy as np
import time
# Directory and base file name used by the file-writing outputs below.
outpath = '.'
prjname = 'regression'

# Benchmark definition read by the ``__main__`` driver: the evaluation,
# method and task entries are classes that the driver instantiates, while
# the outputs are ready-made instances.
config = {
    'evaluations': [RMSE],
    'methods': [GP_RBF, SVIGP_RBF],
    'tasks': [WineQuality, Housing],
    # Number of times each method is fitted and evaluated on each task.
    'repeats': 2,
    'outputs': [
        ScreenOutput(),
        CSVOutput(outpath, prjname),
        H5Output(outpath, prjname),
    ],
}
if __name__ == '__main__':
    tasks = config['tasks']
    methods = config['methods']
    evaluations = config['evaluations']
    repeats = config['repeats']

    # Axes: task, method, evaluation (plus one trailing slot for the elapsed
    # time), repeat.
    results = np.zeros((len(tasks), len(methods), len(evaluations)+1, repeats))

    for task_i, task_cls in enumerate(tasks):
        dataset = task_cls()
        print('Benchmarking on '+dataset.name)
        if not dataset.load_data():
            print('Fail to load '+task_cls.name)
            # Mark the task as missing; leaving the initial zeros would
            # report a perfect score and zero run time for it.
            results[task_i] = np.nan
            continue
        train = dataset.get_training_data()
        test = dataset.get_test_data()

        for method_i, method_cls in enumerate(methods):
            print('With the method '+method_cls.name, end='')
            for ri in range(repeats):
                m = method_cls()
                # The recorded time covers both fitting and prediction.
                t_st = time.time()
                m.fit(train)
                pred = m.predict(test[0])
                t_pd = time.time() - t_st
                for ei, eval_cls in enumerate(evaluations):
                    results[task_i, method_i, ei, ri] = eval_cls().evaluate(test[1], pred)
                results[task_i, method_i, -1, ri] = t_pd
                # One progress dot per completed repeat.
                print('.', end='')
            print()

    for out in config['outputs']:
        out.output(config, results)

View file

@ -0,0 +1,86 @@
# Copyright (c) 2015, Zhenwen Dai
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import abc
import os
import numpy as np
class RegressionTask(object):
    """Interface of a regression dataset used by the benchmark."""
    # NOTE: the ``__metaclass__`` attribute is only honoured by Python 2.
    # Python 3 ignores it, so abstractness is not enforced there.
    __metaclass__ = abc.ABCMeta

    def __init__(self, datapath='./'):
        """Remember ``datapath``, the location used for the dataset file."""
        self.datapath = datapath

    @abc.abstractmethod
    def load_data(self):
        """Download the dataset if it does not exist. Return True if successful."""
        return True

    @abc.abstractmethod
    def get_training_data(self):
        """Return the training data: training inputs and labels."""
        return

    @abc.abstractmethod
    def get_test_data(self):
        """Return the test data: test inputs and labels."""
        return
class Housing(RegressionTask):
    """The UCI housing dataset, split into the first 250 rows and the rest."""
    name='Housing'
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
    filename = 'housing.data'

    def load_data(self):
        """Download the data file if needed and build the train/test split.

        The file is looked up under GPy's ``data_path`` joined with
        ``self.datapath``. The last column is used as the label and all
        other columns as inputs.

        :returns: True on success, False if the file is still missing after
            the download attempt
        """
        from GPy.util.datasets import download_url, data_path
        local_file = os.path.join(data_path, self.datapath, self.filename)
        if not os.path.exists(local_file):
            # ``self.url`` rather than ``Housing.url``: consistent with
            # WineQuality, and it honours a subclass overriding ``url``.
            download_url(self.url, self.datapath, messages=True)
        if not os.path.exists(local_file):
            return False

        data = np.loadtxt(local_file)
        self.data = data
        # ``-1:`` keeps the labels two-dimensional (one column).
        inputs, labels = data[:, :-1], data[:, -1:]
        self.train = (inputs[:250], labels[:250])
        self.test = (inputs[250:], labels[250:])
        return True

    def get_training_data(self):
        """Return the ``(inputs, labels)`` pair set by :meth:`load_data`."""
        return self.train

    def get_test_data(self):
        """Return the ``(inputs, labels)`` test pair set by :meth:`load_data`."""
        return self.test
class WineQuality(RegressionTask):
    """The UCI red wine quality dataset, split into the first 1000 rows and the rest."""
    name='WineQuality'
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    filename = 'winequality-red.csv'

    def load_data(self):
        """Download the data file if needed and build the train/test split.

        The file is looked up under GPy's ``data_path`` joined with
        ``self.datapath``. It is parsed as ``;``-separated text with the
        first row skipped. The last column is used as the label and all
        other columns as inputs.

        :returns: True on success, False if the file is still missing after
            the download attempt
        """
        from GPy.util.datasets import download_url, data_path
        local_file = os.path.join(data_path, self.datapath, self.filename)
        if not os.path.exists(local_file):
            download_url(self.url, self.datapath, messages=True)
        if not os.path.exists(local_file):
            return False

        table = np.loadtxt(local_file, skiprows=1, delimiter=';')
        self.data = table
        # ``-1:`` keeps the labels two-dimensional (one column).
        inputs, labels = table[:, :-1], table[:, -1:]
        self.train = (inputs[:1000], labels[:1000])
        self.test = (inputs[1000:], labels[1000:])
        return True

    def get_training_data(self):
        """Return the ``(inputs, labels)`` pair set by :meth:`load_data`."""
        return self.train

    def get_test_data(self):
        """Return the ``(inputs, labels)`` test pair set by :meth:`load_data`."""
        return self.test