diff --git a/AUTHORS.txt b/AUTHORS.txt index f81db5ec..31efef02 100644 --- a/AUTHORS.txt +++ b/AUTHORS.txt @@ -5,3 +5,4 @@ Nicolas Durrande Alan Saul Max Zwiessele Neil D. Lawrence +Zhenwen Dai diff --git a/GPy/inference/latent_function_inference/inferenceX.py b/GPy/inference/latent_function_inference/inferenceX.py index c1c9fe2b..fd213784 100644 --- a/GPy/inference/latent_function_inference/inferenceX.py +++ b/GPy/inference/latent_function_inference/inferenceX.py @@ -4,6 +4,7 @@ import numpy as np from ...core import Model from ...core.parameterization import variational +from GPy.core.parameterization.variational import VariationalPosterior def infer_newX(model, Y_new, optimize=True, init='L2'): """ @@ -60,7 +61,8 @@ class InferenceX(Model): # self.kern.GPU(True) from copy import deepcopy self.posterior = deepcopy(model.posterior) - if hasattr(model, 'variational_prior'): + from ...core.parameterization.variational import VariationalPosterior + if isinstance(model.X, VariationalPosterior): self.uncertain_input = True from ...models.ss_gplvm import IBPPrior from ...models.ss_mrd import IBPPrior_SSMRD @@ -71,7 +73,7 @@ class InferenceX(Model): self.variational_prior = model.variational_prior.copy() else: self.uncertain_input = False - if hasattr(model, 'inducing_inputs'): + if hasattr(model, 'Z'): self.sparse_gp = True self.Z = model.Z.copy() else: diff --git a/benchmarks/regression/evaluation.py b/benchmarks/regression/evaluation.py new file mode 100644 index 00000000..fbbfe6d7 --- /dev/null +++ b/benchmarks/regression/evaluation.py @@ -0,0 +1,21 @@ +# Copyright (c) 2015, Zhenwen Dai +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import abc +import numpy as np + +class Evaluation(object): + __metaclass__ = abc.ABCMeta + + @abc.abstractmethod + def evaluate(self, gt, pred): + """Compute a scalar for access the performance""" + return None + +class RMSE(Evaluation): + "Rooted Mean Square Error" + name = 'RMSE' + + def evaluate(self, gt, pred): + 
return np.sqrt(np.square(gt-pred).astype(float).mean()) + \ No newline at end of file diff --git a/benchmarks/regression/methods.py b/benchmarks/regression/methods.py new file mode 100644 index 00000000..245562e3 --- /dev/null +++ b/benchmarks/regression/methods.py @@ -0,0 +1,81 @@ +# Copyright (c) 2015, Zhenwen Dai +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import abc +import numpy as np +import GPy + +class RegressionMethod(object): + __metaclass__ = abc.ABCMeta + + def __init__(self): + self.preprocess = True + + def _preprocess(self, data, train): + """Zero-mean, unit-variance normalization by default""" + if train: + inputs, labels = data + self.data_mean = inputs.mean(axis=0) + self.data_std = inputs.std(axis=0) + self.labels_mean = labels.mean(axis=0) + self.labels_std = labels.std(axis=0) + return ((inputs-self.data_mean)/self.data_std, (labels-self.labels_mean)/self.labels_std) + else: + return (data-self.data_mean)/self.data_std + + def _reverse_trans_labels(self, labels): + return labels*self.labels_std+self.labels_mean + + def fit(self, train_data): + if self.preprocess: + train_data = self._preprocess(train_data, True) + return self._fit(train_data) + + def predict(self, test_data): + if self.preprocess: + test_data = self._preprocess(test_data, False) + labels = self._predict(test_data) + if self.preprocess: + labels = self._reverse_trans_labels(labels) + return labels + + @abc.abstractmethod + def _fit(self, train_data): + """Fit the model. 
Return True if successful""" + return True + + @abc.abstractmethod + def _predict(self, test_data): + """Predict on test data""" + return None + +class GP_RBF(RegressionMethod): + name = 'GP_RBF' + + def _fit(self, train_data): + inputs, labels = train_data + self.model = GPy.models.GPRegression(inputs, labels,kernel=GPy.kern.RBF(inputs.shape[-1],ARD=True) +GPy.kern.Linear(inputs.shape[1], ARD=True) + GPy.kern.White(inputs.shape[1],0.01) ) + self.model.likelihood.variance[:] = labels.var()*0.01 + self.model.optimize() + return True + + def _predict(self, test_data): + return self.model.predict(test_data)[0] + +class SVIGP_RBF(RegressionMethod): + name = 'SVIGP_RBF' + + def _fit(self, train_data): + X, Y = train_data + + Z = X[np.random.permutation(X.shape[0])[:100]] + k = GPy.kern.RBF(X.shape[1], ARD=True) + GPy.kern.Linear(X.shape[1], ARD=True) + GPy.kern.White(X.shape[1],0.01) + + lik = GPy.likelihoods.StudentT(deg_free=3.) + self.model = GPy.core.SVGP(X, Y, Z=Z, kernel=k, likelihood=lik) + [self.model.optimize('scg', max_iters=40, gtol=0, messages=0, xtol=0, ftol=0) for i in range(10)] + self.model.optimize('bfgs', max_iters=1000, gtol=0, messages=0) + return True + + def _predict(self, test_data): + return self.model.predict(test_data)[0] diff --git a/benchmarks/regression/outputs.py b/benchmarks/regression/outputs.py new file mode 100644 index 00000000..2294bbe0 --- /dev/null +++ b/benchmarks/regression/outputs.py @@ -0,0 +1,64 @@ +# Copyright (c) 2015, Zhenwen Dai +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +from __future__ import print_function +import abc +import os +import numpy as np + +class Output(object): + __metaclass__ = abc.ABCMeta + + @abc.abstractmethod + def output(self, config, results): + """Return the test data: training data and labels""" + return None + +class ScreenOutput(Output): + + def output(self, config, results): + print('='*10+'Report'+'='*10) + print('\t'.join([' ']+[m.name+'('+e+')' for m in config['methods'] for 
e in [a.name for a in config['evaluations']]+['time']])) + for task_i in range(len(config['tasks'])): + print(config['tasks'][task_i].name+'\t', end='') + + outputs = [] + for method_i in range(len(config['methods'])): + for ei in range(len(config['evaluations'])+1): + m,s = results[task_i, method_i, ei].mean(), results[task_i, method_i, ei].std() + outputs.append('%e(%e)'%(m,s)) + print('\t'.join(outputs)) + +class CSVOutput(Output): + + def __init__(self, outpath, prjname): + self.fname = os.path.join(outpath, prjname+'.csv') + + def output(self, config, results): + with open(self.fname,'w') as f: + f.write(','.join([' ']+[m.name+'('+e+')' for m in config['methods'] for e in [a.name for a in config['evaluations']]+['time']])+'\n') + for task_i in range(len(config['tasks'])): + f.write(config['tasks'][task_i].name+',') + + outputs = [] + for method_i in range(len(config['methods'])): + for ei in range(len(config['evaluations'])+1): + m,s = results[task_i, method_i, ei].mean(), results[task_i, method_i, ei].std() + outputs.append('%e (%e)'%(m,s)) + f.write(','.join(outputs)+'\n') + f.close() + +class H5Output(Output): + + def __init__(self, outpath, prjname): + self.fname = os.path.join(outpath, prjname+'.h5') + + def output(self, config, results): + try: + import h5py + f = h5py.File(self.fname,'w') + d = f.create_dataset('results',results.shape, dtype=results.dtype) + d[:] = results + f.close() + except: + raise RuntimeError('Fails to write the parameters into a HDF5 file!') 
diff --git a/benchmarks/regression/run.py b/benchmarks/regression/run.py new file mode 100644 index 00000000..ee4160e2 --- /dev/null +++ b/benchmarks/regression/run.py @@ -0,0 +1,53 @@ +# Copyright (c) 2015, Zhenwen Dai +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +from __future__ import print_function +from evaluation import RMSE +from methods import GP_RBF, SVIGP_RBF +from tasks import Housing, WineQuality +from outputs import ScreenOutput, CSVOutput, H5Output +import numpy as np +import time + +outpath = '.' +prjname = 'regression' +config = { + 'evaluations':[RMSE], + 'methods':[GP_RBF, SVIGP_RBF], + 'tasks':[WineQuality,Housing], + 'repeats':2, + 'outputs': [ScreenOutput(), CSVOutput(outpath, prjname), H5Output(outpath, prjname)] + } + +if __name__=='__main__': + results = np.zeros((len(config['tasks']), len(config['methods']), len(config['evaluations'])+1, config['repeats'])) + + for task_i in range(len(config['tasks'])): + dataset = config['tasks'][task_i]() + print('Benchmarking on '+dataset.name) + res = dataset.load_data() + if not res: print('Fail to load '+config['tasks'][task_i].name); continue + train = dataset.get_training_data() + test = dataset.get_test_data() + + for method_i in range(len(config['methods'])): + method = config['methods'][method_i] + print('With the method '+method.name, end='') + for ri in range(config['repeats']): + m = method() + t_st = time.time() + m.fit(train) + pred = m.predict(test[0]) + t_pd = time.time() - t_st + for ei in range(len(config['evaluations'])): + evalu = config['evaluations'][ei]() + results[task_i, method_i, ei, ri] = evalu.evaluate(test[1], pred) + results[task_i, method_i, -1, ri] = t_pd + print('.',end='') + print() + + [out.output(config, results) for out in config['outputs']] + + + + diff --git a/benchmarks/regression/tasks.py b/benchmarks/regression/tasks.py new file mode 100644 index 00000000..9cecbdd8 --- /dev/null +++ b/benchmarks/regression/tasks.py @@ -0,0 +1,86 @@ +# Copyright 
(c) 2015, Zhenwen Dai +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import abc +import os +import numpy as np + +class RegressionTask(object): + __metaclass__ = abc.ABCMeta + + def __init__(self, datapath='./'): + self.datapath = datapath + + @abc.abstractmethod + def load_data(self): + """Download the dataset if not exist. Return True if successful""" + return True + + @abc.abstractmethod + def get_training_data(self): + """Return the training data: training data and labels""" + return None + + @abc.abstractmethod + def get_test_data(self): + """Return the test data: training data and labels""" + return None + +class Housing(RegressionTask): + + name='Housing' + url = "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data" + filename = 'housing.data' + + def load_data(self): + from GPy.util.datasets import download_url, data_path + if not os.path.exists(os.path.join(data_path,self.datapath, self.filename)): + download_url(Housing.url, self.datapath, messages=True) + if not os.path.exists(os.path.join(data_path, self.datapath, self.filename)): + return False + + data = np.loadtxt(os.path.join(data_path, self.datapath, self.filename)) + self.data = data + data_train = data[:250,:-1] + label_train = data[:250, -1:] + self.train = (data_train, label_train) + data_test = data[250:,:-1] + label_test = data[250:,-1:] + self.test = (data_test, label_test) + return True + + def get_training_data(self): + return self.train + + def get_test_data(self): + return self.test + +class WineQuality(RegressionTask): + + name='WineQuality' + url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" + filename = 'winequality-red.csv' + + def load_data(self): + from GPy.util.datasets import download_url, data_path + if not os.path.exists(os.path.join(data_path,self.datapath, self.filename)): + download_url(self.url, self.datapath, messages=True) + if not os.path.exists(os.path.join(data_path, 
self.datapath, self.filename)): + return False + + data = np.loadtxt(os.path.join(data_path, self.datapath, self.filename),skiprows=1,delimiter=';') + self.data = data + data_train = data[:1000,:-1] + label_train = data[:1000, -1:] + self.train = (data_train, label_train) + data_test = data[1000:,:-1] + label_test = data[1000:,-1:] + self.test = (data_test, label_test) + return True + + def get_training_data(self): + return self.train + + def get_test_data(self): + return self.test +