diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index 5febdbaf..f4862842 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -378,6 +378,17 @@ def stick(): return m +def robot_wireless(): + data = GPy.util.datasets.robot_wireless() + # optimize + m = GPy.models.GPLVM(data['Y'], 2) + m.optimize(messages=1, max_f_eval=10000) + m._set_params(m._get_params()) + plt.clf() + ax = m.plot_latent() + + return m + def stick_bgplvm(model=None): data = GPy.util.datasets.osu_run1() Q = 6 diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py index 0776426f..c87bd57b 100644 --- a/GPy/examples/regression.py +++ b/GPy/examples/regression.py @@ -325,6 +325,27 @@ def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf): return np.array(lls) +def robot_wireless(optim_iters=100): + """Predict the location of a robot given wireless signal strength readings.""" + data = GPy.util.datasets.robot_wireless() + + # create simple GP Model + m = GPy.models.GPRegression(data['Y'], data['X']) + + # optimize + m.optimize(messages=True, max_f_eval=optim_iters) + Ypredict = m.predict(data['Y'])[0] + pb.plot(data['Xtest'][:, 0], data['Xtest'][:, 1], 'r-') + pb.plot(Ypredict[:, 0], Ypredict[:, 1], 'b-') + pb.axis('equal') + pb.title('WiFi Localization with Gaussian Processes') + pb.legend(('True Location', 'Predicted Location')) + + sse = ((data['Xtest'] - m.predict(data['Ytest'])[0])**2).sum() + print(m) + print('Sum of squares error on test data: ', str(sse)) + return m + def sparse_GP_regression_1D(N=400, num_inducing=5, optim_iters=100): """Run a 1D example of a sparse GP regression.""" # sample inputs and outputs diff --git a/GPy/kern/parts/kernpart.py b/GPy/kern/parts/kernpart.py index 7fa54b42..edbdc091 100644 --- a/GPy/kern/parts/kernpart.py +++ b/GPy/kern/parts/kernpart.py @@ -58,3 +58,19 @@ class Kernpart(object): raise NotImplementedError def dK_dX(self, dL_dK, X, X2, target): 
raise NotImplementedError + +class Kernpart_inner(Kernpart): + def __init__(self,input_dim): + """ + The base class for a kernpart_inner: a positive definite function which forms part of a kernel that is based on the inner product between inputs. + + :param input_dim: the number of input dimensions to the function + :type input_dim: int + + Do not instantiate. + """ + Kernpart.__init__(self, input_dim) + + # initialize cache + self._Z, self._mu, self._S = np.empty(shape=(3, 1)) + self._X, self._X2, self._params = np.empty(shape=(3, 1)) diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index fa287939..cef9a2a9 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -23,7 +23,7 @@ def reporthook(a,b,c): # Global variables data_path = os.path.join(os.path.dirname(__file__), 'datasets') default_seed = 10000 - +overide_manual_authorize=False neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/' cmu_url = 'http://mocap.cs.cmu.edu/subjects/' # Note: there may be a better way of storing data resources. One of the pythonistas will need to take a look. @@ -33,7 +33,13 @@ data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'], 'citation' : """3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.""", 'details' : """Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."""}, - + 'boston_housing' : {'urls' : ['http://archive.ics.uci.edu/ml/machine-learning-databases/housing/'], + 'files' : [['Index', 'housing.data', 'housing.names']], + 'citation' : """Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. 
Economics & Management, vol.5, 81-102, 1978.""", + 'details' : """The Boston Housing data relates house values in Boston to a range of input variables.""", + 'license' : None, + 'size' : 51276 + }, 'brendan_faces' : {'urls' : ['http://www.cs.nyu.edu/~roweis/data/'], 'files': [['frey_rawface.mat']], 'citation' : 'Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.', @@ -90,6 +96,12 @@ The database was created with funding from NSF EIA-0196217.""", 'citation' : """Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.""", 'license' : """Data is made available by the Delve system at the University of Toronto""", 'size' : 5861646}, + 'robot_wireless' : {'urls' : [neil_url + 'robot_wireless/'], + 'files' : [['uw-floor.txt']], + 'citation' : """WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.""", + 'details' : """Data created by Brian Ferris and Dieter Fox. 
Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.""", + 'license' : None, + 'size' : 284390}, 'swiss_roll' : {'urls' : ['http://isomap.stanford.edu/'], 'files' : [['swiss_roll_data.mat']], 'details' : """Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""", @@ -163,12 +175,19 @@ def authorize_download(dataset_name=None): print('') print('Data will be stored in ' + os.path.join(data_path, dataset_name) + '.') print('') - if dr['license']: - print('You must also agree to the following license:') - print(dr['license']) - print('') - print('Do you wish to proceed with the download? [yes/no]') - return prompt_user() + if overide_manual_authorize: + if dr['license']: + print('You have agreed to the following license:') + print(dr['license']) + print('') + return True + else: + if dr['license']: + print('You must also agree to the following license:') + print(dr['license']) + print('') + print('Do you wish to proceed with the download? 
[yes/no]') + return prompt_user() def download_data(dataset_name=None): """Check with the user that the are happy with terms and conditions for the data set, then download it.""" @@ -254,6 +273,14 @@ def sample_class(f): c = np.where(c, 1, -1) return c +def boston_housing(data_set='boston_housing'): + if not data_available(data_set): + download_data(data_set) + all_data = np.genfromtxt(os.path.join(data_path, data_set, 'housing.data')) + X = all_data[:, 0:13] + Y = all_data[:, 13:14] + return data_details_return({'X' : X, 'Y': Y}, data_set) + def brendan_faces(data_set='brendan_faces'): if not data_available(data_set): download_data(data_set) @@ -339,6 +366,47 @@ def pumadyn(seed=default_seed, data_set='pumadyn-32nm'): Ytest = data[indicesTest, -1][:, None] return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'seed': seed}, data_set) +def robot_wireless(data_set='robot_wireless'): + # WiFi access point strengths on a tour around UW Paul Allen building. + if not data_available(data_set): + download_data(data_set) + file_name = os.path.join(data_path, data_set, 'uw-floor.txt') + all_time = np.genfromtxt(file_name, usecols=(0)) + macaddress = np.genfromtxt(file_name, usecols=(1), dtype='string') + x = np.genfromtxt(file_name, usecols=(2)) + y = np.genfromtxt(file_name, usecols=(3)) + strength = np.genfromtxt(file_name, usecols=(4)) + addresses = np.unique(macaddress) + times = np.unique(all_time) + addresses.sort() + times.sort() + allY = np.zeros((len(times), len(addresses))) + allX = np.zeros((len(times), 2)) + allY[:]=-92. 
+ strengths={} + for address, j in zip(addresses, range(len(addresses))): + ind = np.nonzero(address==macaddress) + temp_strengths=strength[ind] + temp_x=x[ind] + temp_y=y[ind] + temp_times = all_time[ind] + for time in temp_times: + vals = time==temp_times + if any(vals): + ind2 = np.nonzero(vals) + i = np.nonzero(time==times) + allY[i, j] = temp_strengths[ind2] + allX[i, 0] = temp_x[ind2] + allX[i, 1] = temp_y[ind2] + allY = (allY + 85.)/15. + + X = allX[0:215, :] + Y = allY[0:215, :] + + Xtest = allX[215:, :] + Ytest = allY[215:, :] + return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'addresses' : addresses, 'times' : times}, data_set) + def silhouette(data_set='ankur_pose_data'): # Ankur Agarwal and Bill Trigg's silhoutte data. if not data_available(data_set): @@ -468,6 +536,21 @@ def olympic_100m_men(data_set='rogers_girolami_data'): Y = olympic_data[:, 1][:, None] return data_details_return({'X': X, 'Y': Y, 'info': "Olympic sprint times for 100 m men from 1896 until 2008. Example is from Rogers and Girolami's First Course in Machine Learning."}, data_set) +def olympic_100m_women(data_set='rogers_girolami_data'): + if not data_available(data_set): + download_data(data_set) + path = os.path.join(data_path, data_set) + tar_file = os.path.join(path, 'firstcoursemldata.tar.gz') + tar = tarfile.open(tar_file) + print('Extracting file.') + tar.extractall(path=path) + tar.close() + olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['female100'] + + X = olympic_data[:, 0][:, None] + Y = olympic_data[:, 1][:, None] + return data_details_return({'X': X, 'Y': Y, 'info': "Olympic sprint times for 100 m women from 1896 until 2008. 
Example is from Rogers and Girolami's First Course in Machine Learning."}, data_set) + def olympic_marathon_men(data_set='olympic_marathon_men'): if not data_available(data_set): download_data(data_set) @@ -543,7 +626,6 @@ def crescent_data(num_data=200, seed=default_seed): num_data_total += num_data_part[i] X = np.vstack((Xparts[0], Xparts[1], Xparts[2], Xparts[3])) - Y = np.vstack((np.ones((num_data_part[0] + num_data_part[1], 1)), -np.ones((num_data_part[2] + num_data_part[3], 1)))) return {'X':X, 'Y':Y, 'info': "Two separate classes of data formed approximately in the shape of two crescents."}