Added robot_wireless data set and examples.

2026-05-03 08:42:39 +02:00 · 2013-08-18 23:46:02 +02:00 · 2013-08-18 23:46:02 +02:00 · 4082f6c02e
commit 4082f6c02e
parent c45a80499c
4 changed files with 139 additions and 9 deletions
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@ -23,7 +23,7 @@ def reporthook(a,b,c):
 # Global variables
 data_path = os.path.join(os.path.dirname(__file__), 'datasets')
 default_seed = 10000
-
+overide_manual_authorize=False
 neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/'
 cmu_url = 'http://mocap.cs.cmu.edu/subjects/'
 # Note: there may be a better way of storing data resources. One of the pythonistas will need to take a look.
@ -33,7 +33,13 @@ data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
                                       'citation' : """3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.""",
                                       'details' : """Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."""},
                   
-                  
+                  'boston_housing' : {'urls' : ['http://archive.ics.uci.edu/ml/machine-learning-databases/housing/'],
+                                      'files' : [['Index', 'housing.data', 'housing.names']],
+                                      'citation' : """Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.""",
+                                      'details' : """The Boston Housing data relates house values in Boston to a range of input variables.""",
+                                      'license' : None,
+                                      'size' : 51276
+                                      },
                  'brendan_faces' : {'urls' : ['http://www.cs.nyu.edu/~roweis/data/'],
                                     'files': [['frey_rawface.mat']],
                                     'citation' : 'Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.',
@ -90,6 +96,12 @@ The database was created with funding from NSF EIA-0196217.""",
                                    'citation' : """Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.""",
                                    'license' : """Data is made available by the Delve system at the University of Toronto""",
                                    'size' : 5861646},
+                  'robot_wireless' : {'urls' : [neil_url + 'robot_wireless/'],
+                                      'files' : [['uw-floor.txt']],
+                                      'citation' : """WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.""",
+                                      'details' : """Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.""",
+                                      'license' : None,
+                                      'size' : 284390},
                  'swiss_roll' : {'urls' : ['http://isomap.stanford.edu/'],
                                  'files' : [['swiss_roll_data.mat']],
                                  'details' : """Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""",
@ -163,12 +175,19 @@ def authorize_download(dataset_name=None):
        print('')
    print('Data will be stored in ' + os.path.join(data_path, dataset_name) + '.')
    print('')
-    if dr['license']:
-        print('You must also agree to the following license:')
-        print(dr['license'])
-        print('')
-    print('Do you wish to proceed with the download? [yes/no]')
-    return prompt_user()
+    if overide_manual_authorize:
+        if dr['license']:
+            print('You have agreed to the following license:')
+            print(dr['license'])
+            print('')
+        return True
+    else:
+        if dr['license']:
+            print('You must also agree to the following license:')
+            print(dr['license'])
+            print('')
+        print('Do you wish to proceed with the download? [yes/no]')
+        return prompt_user()

 def download_data(dataset_name=None):
    """Check with the user that the are happy with terms and conditions for the data set, then download it."""
@ -254,6 +273,14 @@ def sample_class(f):
    c = np.where(c, 1, -1)
    return c

+def boston_housing(data_set='boston_housing'):
+    """Load the Boston housing data, downloading it first if it is not available.
+
+    The table in 'housing.data' is split column-wise: the first 13 columns
+    are returned as 'X' and column 13 as the single-column array 'Y'.
+
+    :param data_set: name of the data set; also the sub-directory of data_path holding the file.
+    :returns: the result of data_details_return for the dictionary {'X', 'Y'}.
+    """
+    if not data_available(data_set):
+        download_data(data_set)
+    housing_file = os.path.join(data_path, data_set, 'housing.data')
+    table = np.genfromtxt(housing_file)
+    # Slice 13:14 (rather than index 13) keeps the targets two-dimensional.
+    inputs, targets = table[:, 0:13], table[:, 13:14]
+    return data_details_return({'X' : inputs, 'Y': targets}, data_set)
+
 def brendan_faces(data_set='brendan_faces'):
    if not data_available(data_set):
        download_data(data_set)
@ -339,6 +366,47 @@ def pumadyn(seed=default_seed, data_set='pumadyn-32nm'):
    Ytest = data[indicesTest, -1][:, None]
    return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'seed': seed}, data_set)

+def robot_wireless(data_set='robot_wireless'):
+    """Load the robot_wireless WiFi access point strength data.
+
+    WiFi access point strengths on a tour around UW Paul Allen building.
+    The data are downloaded first if they are not already available.
+    Columns 0-4 of 'uw-floor.txt' are read as time stamp, access point
+    address, x position, y position and signal strength. Readings are
+    arranged into one row per unique time stamp and one column per unique
+    address; entries without a reading keep the fill value -92. Strengths
+    are then rescaled as (strength + 85) / 15.
+
+    :param data_set: name of the data set; also the sub-directory of data_path holding the file.
+    :returns: the result of data_details_return for a dictionary with
+        'X'/'Y' (positions and rescaled strengths for the first 215 time
+        stamps), 'Xtest'/'Ytest' (the same for the remaining time stamps),
+        and 'addresses'/'times' (the sorted unique values).
+    """
+    if not data_available(data_set):
+        download_data(data_set)
+    file_name = os.path.join(data_path, data_set, 'uw-floor.txt')
+    # Parse the numeric columns in one pass instead of once per column.
+    all_time, x, y, strength = np.genfromtxt(file_name, usecols=(0, 2, 3, 4), unpack=True)
+    # dtype=str rather than the alias 'string', which numpy only accepts under Python 2.
+    macaddress = np.genfromtxt(file_name, usecols=(1,), dtype=str)
+    # np.unique already returns its values sorted; the inverse maps each
+    # reading to the row of its time stamp.
+    addresses = np.unique(macaddress)
+    times, time_rows = np.unique(all_time, return_inverse=True)
+    allY = np.zeros((len(times), len(addresses)))
+    allX = np.zeros((len(times), 2))
+    # Fill value for (time, address) pairs that have no reading.
+    allY[:] = -92.
+    # Addresses are visited in sorted order so that, for a given time stamp,
+    # the position written last comes from the last address in that order.
+    for j, address in enumerate(addresses):
+        ind = np.nonzero(address == macaddress)[0]
+        rows = time_rows[ind]
+        allY[rows, j] = strength[ind]
+        allX[rows, 0] = x[ind]
+        allX[rows, 1] = y[ind]
+    allY = (allY + 85.)/15.
+
+    # NOTE(review): the train/test cut-off of 215 time stamps is hard-coded.
+    X = allX[0:215, :]
+    Y = allY[0:215, :]
+
+    Xtest = allX[215:, :]
+    # Test targets come from allY, mirroring Y above.
+    Ytest = allY[215:, :]
+    return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'addresses' : addresses, 'times' : times}, data_set)
+
 def silhouette(data_set='ankur_pose_data'):
    # Ankur Agarwal and Bill Trigg's silhoutte data.
    if not data_available(data_set):
@ -468,6 +536,21 @@ def olympic_100m_men(data_set='rogers_girolami_data'):
    Y = olympic_data[:, 1][:, None]
    return data_details_return({'X': X, 'Y': Y, 'info': "Olympic sprint times for 100 m men from 1896 until 2008. Example is from Rogers and Girolami's First Course in Machine Learning."}, data_set)

+def olympic_100m_women(data_set='rogers_girolami_data'):
+    """Load the 'female100' table from the Rogers and Girolami data set.
+
+    If the data set is not available it is downloaded and the archive
+    'firstcoursemldata.tar.gz' is extracted into the data set directory.
+    The first column of the table is returned as 'X' and the second as
+    'Y', each as a single-column array.
+
+    :param data_set: name of the data set; also the sub-directory of data_path holding the files.
+    :returns: the result of data_details_return for the dictionary {'X', 'Y', 'info'}.
+    """
+    if not data_available(data_set):
+        download_data(data_set)
+        path = os.path.join(data_path, data_set)
+        tar_file = os.path.join(path, 'firstcoursemldata.tar.gz')
+        tar = tarfile.open(tar_file)
+        try:
+            print('Extracting file.')
+            # NOTE(review): extractall trusts the member paths stored in the downloaded archive.
+            tar.extractall(path=path)
+        finally:
+            # Close the archive even when extraction fails.
+            tar.close()
+    olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['female100']
+
+    X = olympic_data[:, 0][:, None]
+    Y = olympic_data[:, 1][:, None]
+    # NOTE(review): the year range in 'info' uses the same wording as the men's loader; confirm it for the women's data.
+    return data_details_return({'X': X, 'Y': Y, 'info': "Olympic sprint times for 100 m women from 1896 until 2008. Example is from Rogers and Girolami's First Course in Machine Learning."}, data_set)
+
 def olympic_marathon_men(data_set='olympic_marathon_men'):
    if not data_available(data_set):
        download_data(data_set)
@ -543,7 +626,6 @@ def crescent_data(num_data=200, seed=default_seed):
        num_data_total += num_data_part[i]
    X = np.vstack((Xparts[0], Xparts[1], Xparts[2], Xparts[3]))

-
    Y = np.vstack((np.ones((num_data_part[0] + num_data_part[1], 1)), -np.ones((num_data_part[2] + num_data_part[3], 1))))
    return {'X':X, 'Y':Y, 'info': "Two separate classes of data formed approximately in the shape of two crescents."}