add the regression benchmark

2026-06-08 15:05:15 +02:00 · 2015-08-27 17:04:52 +01:00 · 2015-08-27 17:04:52 +01:00 · 60aa865631
commit 60aa865631
parent f221a3b1fa
7 changed files with 310 additions and 2 deletions
--- a/AUTHORS.txt
+++ b/AUTHORS.txt
@ -5,3 +5,4 @@ Nicolas Durrande
 Alan Saul
 Max Zwiessele
 Neil D. Lawrence
+Zhenwen Dai
--- a/GPy/inference/latent_function_inference/inferenceX.py
+++ b/GPy/inference/latent_function_inference/inferenceX.py
@ -4,6 +4,7 @@
 import numpy as np
 from ...core import Model
 from ...core.parameterization import variational
+from GPy.core.parameterization.variational import VariationalPosterior

 def infer_newX(model, Y_new, optimize=True, init='L2'):
    """
@ -60,7 +61,8 @@ class InferenceX(Model):
 #                 self.kern.GPU(True)
        from copy import deepcopy
        self.posterior = deepcopy(model.posterior)
-        if hasattr(model, 'variational_prior'):
+        from ...core.parameterization.variational import VariationalPosterior
+        if isinstance(model.X, VariationalPosterior):
            self.uncertain_input = True
            from ...models.ss_gplvm import IBPPrior
            from ...models.ss_mrd import IBPPrior_SSMRD
@ -71,7 +73,7 @@ class InferenceX(Model):
                self.variational_prior = model.variational_prior.copy()
        else:
            self.uncertain_input = False
-        if hasattr(model, 'inducing_inputs'):
+        if hasattr(model, 'Z'):
            self.sparse_gp = True
            self.Z = model.Z.copy()
        else:
--- a/benchmarks/regression/evaluation.py
+++ b/benchmarks/regression/evaluation.py
@ -0,0 +1,21 @@
+# Copyright (c) 2015, Zhenwen Dai
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import abc
+import numpy as np
+
+class Evaluation(abc.ABCMeta('_EvaluationABC', (object,), {})):
+    """Abstract interface of a metric that scores predictions against ground truth.
+
+    The base class is created by calling ``abc.ABCMeta`` directly so that
+    ``@abc.abstractmethod`` is enforced on both Python 2 and Python 3
+    (a ``__metaclass__`` class attribute is silently ignored by Python 3).
+    """
+
+    @abc.abstractmethod
+    def evaluate(self, gt, pred):
+        """Compute a scalar that assesses the performance.
+
+        :param gt: the ground-truth values
+        :param pred: the predicted values
+        :returns: a scalar score
+        """
+        return None
+
+class RMSE(Evaluation):
+    """Root Mean Square Error."""
+    name = 'RMSE'
+
+    def evaluate(self, gt, pred):
+        """Return ``sqrt(mean((gt - pred)**2))``.
+
+        :param gt: the ground-truth values
+        :param pred: the predicted values
+        :returns: the root mean square error as a scalar
+        """
+        # Use the builtin ``float``: the ``np.float`` alias was deprecated in
+        # NumPy 1.20 and removed in NumPy 1.24.
+        return np.sqrt(np.square(gt-pred).astype(float).mean())
+    
--- a/benchmarks/regression/methods.py
+++ b/benchmarks/regression/methods.py
@ -0,0 +1,81 @@
+# Copyright (c) 2015, Zhenwen Dai
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import abc
+import numpy as np
+import GPy
+
+class RegressionMethod(abc.ABCMeta('_RegressionMethodABC', (object,), {})):
+    """Abstract interface of a regression method with optional normalization.
+
+    Subclasses implement :meth:`_fit` and :meth:`_predict`; :meth:`fit` and
+    :meth:`predict` wrap them with the normalization of :meth:`_preprocess`
+    when ``self.preprocess`` is true.
+
+    The base class is created by calling ``abc.ABCMeta`` directly so that
+    ``@abc.abstractmethod`` is enforced on both Python 2 and Python 3
+    (a ``__metaclass__`` class attribute is silently ignored by Python 3).
+    """
+
+    def __init__(self):
+        self.preprocess = True
+
+    @staticmethod
+    def _safe_std(values):
+        """Return ``values.std(axis=0)`` with zero entries replaced by one.
+
+        A constant column has zero standard deviation; dividing by it would
+        produce NaN/inf, so such columns are only centred, not rescaled.
+        """
+        std = values.std(axis=0)
+        return np.where(std == 0, 1., std)
+
+    def _preprocess(self, data, train):
+        """Zero-mean, unit-variance normalization by default.
+
+        :param data: ``(inputs, labels)`` if ``train`` is true, otherwise the
+                     test inputs only
+        :param train: if true, (re)estimate and store the normalization
+                      statistics from ``data`` before applying them
+        :returns: normalized ``(inputs, labels)`` if ``train`` is true,
+                  otherwise the normalized inputs
+        """
+        if train:
+            inputs, labels = data
+            self.data_mean = inputs.mean(axis=0)
+            self.data_std = self._safe_std(inputs)
+            self.labels_mean = labels.mean(axis=0)
+            self.labels_std = self._safe_std(labels)
+            return ((inputs-self.data_mean)/self.data_std, (labels-self.labels_mean)/self.labels_std)
+        else:
+            return (data-self.data_mean)/self.data_std
+
+    def _reverse_trans_labels(self, labels):
+        """Map normalized labels back to the original label scale."""
+        return labels*self.labels_std+self.labels_mean
+
+    def fit(self, train_data):
+        """Fit the method on ``train_data`` (a pair of inputs and labels).
+
+        :returns: whatever :meth:`_fit` returns
+        """
+        if self.preprocess:
+            train_data = self._preprocess(train_data, True)
+        return self._fit(train_data)
+
+    def predict(self, test_data):
+        """Predict the labels of ``test_data`` in the original label scale.
+
+        Relies on the statistics stored by :meth:`fit` when preprocessing is
+        enabled, so :meth:`fit` has to be called first.
+        """
+        if self.preprocess:
+            test_data = self._preprocess(test_data, False)
+        labels = self._predict(test_data)
+        if self.preprocess:
+            labels = self._reverse_trans_labels(labels)
+        return labels
+
+    @abc.abstractmethod
+    def _fit(self, train_data):
+        """Fit the model. Return True if successful"""
+        return True
+
+    @abc.abstractmethod
+    def _predict(self, test_data):
+        """Predict on test data"""
+        return None
+    
+class GP_RBF(RegressionMethod):
+    """GP regression using the sum of an ARD RBF, an ARD linear and a white kernel."""
+    name = 'GP_RBF'
+
+    def _fit(self, train_data):
+        """Build a ``GPy.models.GPRegression`` model on the data and optimize it."""
+        X, Y = train_data
+        input_dim = X.shape[1]
+        rbf = GPy.kern.RBF(input_dim, ARD=True)
+        linear = GPy.kern.Linear(input_dim, ARD=True)
+        white = GPy.kern.White(input_dim, 0.01)
+        self.model = GPy.models.GPRegression(X, Y, kernel=rbf + linear + white)
+        # Start the likelihood variance at 1% of the variance of the labels.
+        self.model.likelihood.variance[:] = Y.var()*0.01
+        self.model.optimize()
+        return True
+
+    def _predict(self, test_data):
+        """Return the first element of ``model.predict`` (presumably the predictive mean -- confirm against GPy)."""
+        prediction = self.model.predict(test_data)
+        return prediction[0]
+
+class SVIGP_RBF(RegressionMethod):
+    """``GPy.core.SVGP`` model with a Student-t likelihood and an RBF + linear + white kernel."""
+    name = 'SVIGP_RBF'
+
+    def _fit(self, train_data):
+        """Build the SVGP model and optimize it: 10 short SCG runs followed by one BFGS run."""
+        inputs, labels = train_data
+        input_dim = inputs.shape[1]
+
+        # Take (at most) 100 randomly chosen training inputs as the initial Z.
+        shuffled = np.random.permutation(inputs.shape[0])
+        Z = inputs[shuffled[:100]]
+
+        rbf = GPy.kern.RBF(input_dim, ARD=True)
+        linear = GPy.kern.Linear(input_dim, ARD=True)
+        white = GPy.kern.White(input_dim, 0.01)
+        kernel = rbf + linear + white
+
+        likelihood = GPy.likelihoods.StudentT(deg_free=3.)
+        self.model = GPy.core.SVGP(inputs, labels, Z=Z, kernel=kernel, likelihood=likelihood)
+        for _ in range(10):
+            self.model.optimize('scg', max_iters=40, gtol=0, messages=0, xtol=0, ftol=0)
+        self.model.optimize('bfgs', max_iters=1000, gtol=0, messages=0)
+        return True
+
+    def _predict(self, test_data):
+        """Return the first element of ``model.predict`` (presumably the predictive mean -- confirm against GPy)."""
+        prediction = self.model.predict(test_data)
+        return prediction[0]
--- a/benchmarks/regression/outputs.py
+++ b/benchmarks/regression/outputs.py
@ -0,0 +1,64 @@
+# Copyright (c) 2015, Zhenwen Dai
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from __future__ import print_function
+import abc
+import os
+import numpy as np
+
+class Output(abc.ABCMeta('_OutputABC', (object,), {})):
+    """Abstract interface of a sink that reports the benchmark results.
+
+    The base class is created by calling ``abc.ABCMeta`` directly so that
+    ``@abc.abstractmethod`` is enforced on both Python 2 and Python 3
+    (a ``__metaclass__`` class attribute is silently ignored by Python 3).
+    """
+
+    @abc.abstractmethod
+    def output(self, config, results):
+        """Report the benchmark results.
+
+        :param config: the benchmark configuration; the implementations in
+                       this module read its 'tasks', 'methods' and
+                       'evaluations' entries
+        :param results: the array of results, indexed as
+                        ``results[task, method, evaluation]``
+        """
+        return None
+
+class ScreenOutput(Output):
+    """Print the results to standard output as a tab-separated table."""
+
+    def output(self, config, results):
+        """Print one header row and then one row per task.
+
+        Every cell has the form ``mean(std)``, computed over the entries of
+        ``results[task, method, column]``; the last column of each method
+        is the time.
+        """
+        print('='*10+'Report'+'='*10)
+
+        column_names = [ev.name for ev in config['evaluations']]+['time']
+        header = [' ']
+        for method in config['methods']:
+            for column in column_names:
+                header.append(method.name+'('+column+')')
+        print('\t'.join(header))
+
+        n_methods = len(config['methods'])
+        n_columns = len(config['evaluations'])+1
+        for task_i, task in enumerate(config['tasks']):
+            print(task.name+'\t', end='')
+            cells = ['%e(%e)'%(results[task_i, mi, ci].mean(), results[task_i, mi, ci].std())
+                     for mi in range(n_methods) for ci in range(n_columns)]
+            print('\t'.join(cells))
+
+class CSVOutput(Output):
+    """Write the results as a comma-separated table to ``<outpath>/<prjname>.csv``."""
+
+    def __init__(self, outpath, prjname):
+        """
+        :param outpath: the directory in which the file is created
+        :param prjname: the file name without the '.csv' extension
+        """
+        self.fname = os.path.join(outpath, prjname+'.csv')
+
+    def output(self, config, results):
+        """Write one header row and then one row per task.
+
+        Every cell has the form ``mean (std)``, computed over the entries of
+        ``results[task, method, column]``; the last column of each method
+        is the time. Names are written verbatim (no CSV quoting).
+        """
+        column_names = [ev.name for ev in config['evaluations']]+['time']
+        header = [' ']+[m.name+'('+c+')' for m in config['methods'] for c in column_names]
+        n_methods = len(config['methods'])
+        n_columns = len(column_names)
+
+        # The ``with`` block closes the file; no explicit close() is needed.
+        with open(self.fname,'w') as f:
+            f.write(','.join(header)+'\n')
+            for task_i, task in enumerate(config['tasks']):
+                cells = ['%e (%e)'%(results[task_i, mi, ci].mean(), results[task_i, mi, ci].std())
+                         for mi in range(n_methods) for ci in range(n_columns)]
+                f.write(task.name+','+','.join(cells)+'\n')
+            
+class H5Output(Output):
+    """Store the raw results array in the HDF5 file ``<outpath>/<prjname>.h5``."""
+
+    def __init__(self, outpath, prjname):
+        """
+        :param outpath: the directory in which the file is created
+        :param prjname: the file name without the '.h5' extension
+        """
+        self.fname = os.path.join(outpath, prjname+'.h5')
+
+    def output(self, config, results):
+        """Write ``results`` into the dataset 'results' of the HDF5 file.
+
+        :param config: the benchmark configuration (not used here)
+        :param results: the array of results to store
+        :raises IOError: if h5py cannot be imported or the file cannot be
+                         written
+        """
+        try:
+            import h5py
+            # The context manager closes the file even if writing fails.
+            with h5py.File(self.fname,'w') as f:
+                d = f.create_dataset('results',results.shape, dtype=results.dtype)
+                d[:] = results
+        except Exception as e:
+            # Raising a plain string is itself a TypeError; raise a real
+            # exception that keeps the original message and names the cause.
+            raise IOError('Fails to write the parameters into a HDF5 file! (%s)' % e)
--- a/benchmarks/regression/run.py
+++ b/benchmarks/regression/run.py
@ -0,0 +1,53 @@
+# Copyright (c) 2015, Zhenwen Dai
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from __future__ import print_function
+from evaluation import RMSE
+from methods import GP_RBF, SVIGP_RBF
+from tasks import Housing, WineQuality
+from outputs import ScreenOutput, CSVOutput, H5Output
+import numpy as np
+import time
+
+# Directory and base file name used by the file-based outputs below.
+outpath = '.'
+prjname = 'regression'
+
+# Benchmark configuration: every method is run on every task 'repeats'
+# times, scored by every evaluation, and reported by every output.
+config = {
+    'evaluations': [RMSE],
+    'methods': [GP_RBF, SVIGP_RBF],
+    'tasks': [WineQuality, Housing],
+    'repeats': 2,
+    'outputs': [
+        ScreenOutput(),
+        CSVOutput(outpath, prjname),
+        H5Output(outpath, prjname),
+    ],
+}
+
+if __name__=='__main__':
+    # results[task, method, column, repeat]; the last column of every method
+    # holds the wall-clock time of fit + predict, the others the evaluations.
+    n_columns = len(config['evaluations'])+1
+    results = np.zeros((len(config['tasks']), len(config['methods']), n_columns, config['repeats']))
+
+    for task_i, task_cls in enumerate(config['tasks']):
+        dataset = task_cls()
+        print('Benchmarking on '+dataset.name)
+        if not dataset.load_data():
+            print('Fail to load '+task_cls.name)
+            # Mark the skipped task as missing; leaving the zeros in place
+            # would be reported as a perfect score obtained in no time.
+            results[task_i] = np.nan
+            continue
+        train = dataset.get_training_data()
+        test_inputs, test_labels = dataset.get_test_data()
+
+        for method_i, method_cls in enumerate(config['methods']):
+            print('With the method '+method_cls.name, end='')
+            for ri in range(config['repeats']):
+                m = method_cls()
+                t_st = time.time()
+                m.fit(train)
+                pred = m.predict(test_inputs)
+                t_pd = time.time() - t_st
+                for ei, eval_cls in enumerate(config['evaluations']):
+                    results[task_i, method_i, ei, ri] = eval_cls().evaluate(test_labels, pred)
+                results[task_i, method_i, -1, ri] = t_pd
+                print('.',end='')
+            print()
+
+    for out in config['outputs']:
+        out.output(config, results)
+
+
+            
+            
--- a/benchmarks/regression/tasks.py
+++ b/benchmarks/regression/tasks.py
@ -0,0 +1,86 @@
+# Copyright (c) 2015, Zhenwen Dai
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import abc
+import os
+import numpy as np
+
+class RegressionTask(abc.ABCMeta('_RegressionTaskABC', (object,), {})):
+    """Abstract interface of a regression dataset with a train/test split.
+
+    The base class is created by calling ``abc.ABCMeta`` directly so that
+    ``@abc.abstractmethod`` is enforced on both Python 2 and Python 3
+    (a ``__metaclass__`` class attribute is silently ignored by Python 3).
+    """
+
+    def __init__(self, datapath='./'):
+        """
+        :param datapath: the location of the dataset files, stored as
+                         ``self.datapath`` for use by :meth:`load_data`
+        """
+        self.datapath = datapath
+
+    @abc.abstractmethod
+    def load_data(self):
+        """Download the dataset if it does not exist. Return True if successful"""
+        return True
+
+    @abc.abstractmethod
+    def get_training_data(self):
+        """Return the training data: training inputs and labels"""
+        return None
+
+    @abc.abstractmethod
+    def get_test_data(self):
+        """Return the test data: test inputs and labels"""
+        return None
+    
+class Housing(RegressionTask):
+    """The UCI 'housing' dataset: the first 250 rows are used for training, the rest for testing."""
+
+    name='Housing'
+    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
+    filename = 'housing.data'
+
+    def load_data(self):
+        """Download the file if it is missing, then load and split it.
+
+        The last column is used as the label, all other columns as inputs.
+
+        :returns: True on success, False if the file is still missing after
+                  the download attempt
+        """
+        from GPy.util.datasets import download_url, data_path
+        path = os.path.join(data_path, self.datapath, self.filename)
+        if not os.path.exists(path):
+            # Use ``self.url`` rather than ``Housing.url`` so that a subclass
+            # overriding ``url`` is respected.
+            download_url(self.url, self.datapath, messages=True)
+            if not os.path.exists(path):
+                return False
+
+        data = np.loadtxt(path)
+        self.data = data
+        self.train = (data[:250,:-1], data[:250,-1:])
+        self.test = (data[250:,:-1], data[250:,-1:])
+        return True
+
+    def get_training_data(self):
+        """Return the ``(inputs, labels)`` training pair set by :meth:`load_data`."""
+        return self.train
+
+    def get_test_data(self):
+        """Return the ``(inputs, labels)`` test pair set by :meth:`load_data`."""
+        return self.test
+    
+class WineQuality(RegressionTask):
+    """The UCI red 'wine quality' dataset: the first 1000 rows are used for training, the rest for testing."""
+
+    name='WineQuality'
+    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
+    filename = 'winequality-red.csv'
+
+    def load_data(self):
+        """Download the file if it is missing, then load and split it.
+
+        The file is read as ';'-separated values with one header row; the
+        last column is used as the label, all other columns as inputs.
+
+        :returns: True on success, False if the file is still missing after
+                  the download attempt
+        """
+        from GPy.util.datasets import download_url, data_path
+        local_file = os.path.join(data_path, self.datapath, self.filename)
+        if not os.path.exists(local_file):
+            download_url(self.url, self.datapath, messages=True)
+            if not os.path.exists(local_file):
+                return False
+
+        self.data = np.loadtxt(local_file, skiprows=1, delimiter=';')
+        n_train = 1000
+        head, tail = self.data[:n_train], self.data[n_train:]
+        self.train = (head[:,:-1], head[:,-1:])
+        self.test = (tail[:,:-1], tail[:,-1:])
+        return True
+
+    def get_training_data(self):
+        """Return the ``(inputs, labels)`` training pair set by :meth:`load_data`."""
+        return self.train
+
+    def get_test_data(self):
+        """Return the ``(inputs, labels)`` test pair set by :meth:`load_data`."""
+        return self.test
+