Added the ability to load CMU motion capture databases in the new database loading format.

2026-05-01 07:46:22 +02:00 · 2013-08-18 08:18:27 +02:00 · 2013-08-18 08:18:27 +02:00 · c45a80499c
commit c45a80499c
parent 791f499412
14 changed files with 361 additions and 1736 deletions
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@ -5,13 +5,113 @@ import GPy
 import scipy.sparse
 import scipy.io
 import cPickle as pickle
-import urllib2 as url
+import urllib as url
+import zipfile
+import tarfile
+import gzip
+import zlib

+import sys, urllib
+def reporthook(a,b,c):
+    """Progress callback for urlretrieve: prints the percentage of c bytes covered by a blocks of b bytes."""
+    # urlretrieve may pass c <= 0 (e.g. -1) when the total size is unknown; report 0% instead of dividing by it.
+    percent = min(100, float(a * b) / c * 100) if c > 0 else 0.0
+    # The trailing ',' is important: it suppresses the newline so '\r' can rewind over the same line.
+    print "% 3.1f%% of %d bytes\r" % (percent, c),
+    sys.stdout.flush()
+     
+# Global variables
 data_path = os.path.join(os.path.dirname(__file__), 'datasets')
 default_seed = 10000
-neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/'

+neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/'
+cmu_url = 'http://mocap.cs.cmu.edu/subjects/'
+# Note: there may be a better way of storing data resources. One of the pythonistas will need to take a look.
+data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
+                                       'files' : [['ankurDataPoseSilhouette.mat']],
+                                       'license' : None,
+                                       'citation' : """3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.""",
+                                       'details' : """Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."""},
+                   
+                  
+                  'brendan_faces' : {'urls' : ['http://www.cs.nyu.edu/~roweis/data/'],
+                                     'files': [['frey_rawface.mat']],
+                                     'citation' : 'Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.',
+                                     'details' : """A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.""",
+                                     'license': None,
+                                     'size' : 1100584},
+                  'cmu_mocap_full' : {'urls' : ['http://mocap.cs.cmu.edu'],
+                                 'files' : [['allasfamc.zip']],
+                                 'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.
+The database was created with funding from NSF EIA-0196217.""",
+                                 'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""",
+                                 'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""",
+                                 'size' : None},
+                  'creep_rupture' : {'urls' : ['http://www.msm.cam.ac.uk/map/data/tar/'],
+                                     'files' : [['creeprupt.tar']],
+                                     'citation' : 'Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.',
+                                     'details' : """Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.""",
+                                     'license' : None,
+                                     'size' : 602797},
+                  'della_gatta' : {'urls' : [neil_url + 'della_gatta/'],
+                                   'files': [['DellaGattadata.mat']],
+                                   'citation' : 'Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008',
+                                   'details': "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.",
+                                   'license':None,
+                                   'size':3729650},
+                  'three_phase_oil_flow': {'urls' : [neil_url + 'three_phase_oil_flow/'],
+                                           'files' : [['DataTrnLbls.txt', 'DataTrn.txt', 'DataTst.txt', 'DataTstLbls.txt', 'DataVdn.txt', 'DataVdnLbls.txt']],
+                                           'citation' : 'Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593',
+                                           'details' : """The three phase oil data used initially for demonstrating the Generative Topographic mapping.""",
+                                           'license' : None,
+                                           'size' : 712796},
+                  'rogers_girolami_data' : {'urls' : ['https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/'],
+                                            'files' : [['firstcoursemldata.tar.gz']],
+                                            'suffices' : [['?dl=1']],
+                                            'citation' : 'A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146',
+                                            'details' : """Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.""",
+                                            'license' : None,
+                                            'size' : 21949154},
+                  'olympic_marathon_men' : {'urls' : [neil_url + 'olympic_marathon_men/'],
+                                            'files' : [['olympicMarathonTimes.csv']],
+                                            'citation' : None,
+                                            'details' : """Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data""",
+                                            'license': None,
+                                            'size' : 584},
+                  'osu_run1' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'],
+                                'files': [['sprintTXT.ZIP'],['connections.txt']],
+                                'details' : "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.",
+                                'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.',
+                                'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).',
+                                'size': 338103},
+                  'pumadyn-32nm' : {'urls' : ['ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/'],
+                                    'files' : [['pumadyn-32nm.tar.gz']],
+                                    'details' : """Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.""",
+                                    'citation' : """Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.""",
+                                    'license' : """Data is made available by the Delve system at the University of Toronto""",
+                                    'size' : 5861646},
+                  'swiss_roll' : {'urls' : ['http://isomap.stanford.edu/'],
+                                  'files' : [['swiss_roll_data.mat']],
+                                  'details' : """Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""",
+                                  'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000',
+                                  'license' : None,
+                                  'size' : 800256},
+                  'ripley_prnn_data' : {'urls' : ['http://www.stats.ox.ac.uk/pub/PRNN/'],
+                                        'files' : [['Cushings.dat', 'README', 'crabs.dat', 'fglass.dat', 'fglass.grp', 'pima.te', 'pima.tr', 'pima.tr2', 'synth.te', 'synth.tr', 'viruses.dat', 'virus3.dat']],
+                                        'details' : """Data sets from Brian Ripley's Pattern Recognition and Neural Networks""",
+                                        'citation': """Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7""",
+                                        'license' : None,
+                                        'size' : 93565},
+                  'isomap_face_data' : {'urls' : [neil_url + 'isomap_face_data/'],
+                                        'files' : [['face_data.mat']],
+                                        'details' : """Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""",
+                                        'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000',
+                                        'license' : None,
+                                        'size' : 24229368},
+                  }
+                  
 def prompt_user():
+    """Ask user for agreeing to data set licenses."""
    # raw_input returns the empty string for "enter"
    yes = set(['yes', 'y'])
    no = set(['no','n'])
@ -25,45 +125,127 @@ def prompt_user():
        sys.stdout.write("Please respond with 'yes', 'y' or 'no', 'n'")
        return prompt_user()

-def download_data(dataset_name=None):
-    """Helper function which contains the resource locations for each data set in one place"""
-
-    # Note: there may be a better way of doing this. One of the pythonistas will need to take a look. Neil
-    data_resources = {'oil': {'urls' : [neil_url + 'oil_data/'],
-                              'files' : [['DataTrnLbls.txt', 'DataTrn.txt']],
-                              'citation' : 'Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593',
-                              'details' : """The three phase oil data used initially for demonstrating the Generative Topographic mapping.""",
-                              'agreement' : None},
-                      'brendan_faces' : {'url' : ['http://www.cs.nyu.edu/~roweis/data/'],
-                                         'files': [['frey_rawface.mat']],
-                                         'citation' : 'Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.',
-                                         'details' : """A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.""",
-                                         'agreement': None}
-                      }
-
+def data_available(dataset_name=None):
+    """Check if the data set is available on the local machine already."""
+    for file_list in data_resources[dataset_name]['files']:
+        for file in file_list:
+            if not os.path.exists(os.path.join(data_path, dataset_name, file)):
+                return False
+    return True
+            
+def download_url(url, store_directory, save_name = None, messages = True, suffix=''):
+    """Download url (with suffix appended to the request only) into data_path/store_directory, named after the last path component of url.
+    NOTE: save_name is accepted for interface compatibility but ignored; messages=False silences the 'Downloading' line."""
+    file = url[url.rfind('/')+1:]
+    dir_name = os.path.join(data_path, store_directory)
+    save_name = os.path.join(dir_name, file)
+    if messages:
+        print "Downloading ", url, "->", os.path.join(store_directory, file)
+    if not os.path.exists(dir_name):
+        os.makedirs(dir_name)
+    urllib.urlretrieve(url+suffix, save_name, reporthook)

+def authorize_download(dataset_name=None):
+    """Show the details, citation, size and license of a data set and ask the user whether to download it.
+
+    Returns the result of prompt_user() for the user's yes/no answer."""
    print('Acquiring resource: ' + dataset_name)
    # TODO, check resource is in dictionary!
+    print('')
    dr = data_resources[dataset_name]
    print('Details of data: ')
    print(dr['details'])
+    print('')
    if dr['citation']:
        print('Please cite:')
        print(dr['citation'])
-    if dr['agreement']:
-        print('You must also agree to the following:')
-        print(dr['agreement'])
+        print('')
+    # Not every entry in data_resources defines 'size' (e.g. 'ankur_pose_data'), so look it up with .get.
+    if dr.get('size'):
+        print('After downloading the data will take up ' + str(dr['size']) + ' bytes of space.')
+        print('')
+    print('Data will be stored in ' + os.path.join(data_path, dataset_name) + '.')
+    print('')
+    if dr.get('license'):
+        print('You must also agree to the following license:')
+        print(dr['license'])
+        print('')
    print('Do you wish to proceed with the download? [yes/no]')
-    if prompt_user()==False:
+    return prompt_user()
+
+def download_data(dataset_name=None):
+    """Check with the user that they are happy with terms and conditions for the data set, then download it.
+
+    Returns False when the user declines and True once every listed file has been requested."""
+
+    dr = data_resources[dataset_name]
+    if not authorize_download(dataset_name):
        return False

-    for url, files in zip(dr['urls'], dr['files']):
-        for file in files:
-            download_resource(url + file)
+    # Urls are joined with '/' explicitly: os.path.join would use the platform's path separator.
+    if dr.has_key('suffices'):
+        for url, files, suffices in zip(dr['urls'], dr['files'], dr['suffices']):
+            for file, suffix in zip(files, suffices):
+                download_url(url.rstrip('/') + '/' + file, dataset_name, dataset_name, suffix=suffix)
+    else:
+        for url, files in zip(dr['urls'], dr['files']):
+            for file in files:
+                download_url(url.rstrip('/') + '/' + file, dataset_name, dataset_name)
    return True
                  
+def data_details_return(data, data_set):
+    """Update the data component of the data dictionary with details drawn from the data_resources."""
+    data.update(data_resources[data_set])
+    return data

+    
+def cmu_urls_files(subj_motions, messages = True):
+    '''
+    Find which resources are missing on the local disk for the requested CMU motion capture motions.
+    subj_motions is a pair (subjects, motions): motions[i] lists the motion numbers wanted for subjects[i]; messages is not used here.
+    Returns a dictionary with parallel lists 'urls' (one per subject with missing files) and 'files' (the file names missing for that subject).
+    '''
+    subjects_num = subj_motions[0]
+    motions_num = subj_motions[1]
+
+    resource = {'urls' : [], 'files' : []}
+    # Convert numbers to strings
+    subjects = []
+    motions = [list() for _ in range(len(subjects_num))]
+    for i in range(len(subjects_num)):
+        curSubj = str(int(subjects_num[i]))
+        if int(subjects_num[i]) < 10:
+            curSubj = '0' + curSubj
+        subjects.append(curSubj)
+        for j in range(len(motions_num[i])):
+            curMot = str(int(motions_num[i][j]))
+            if int(motions_num[i][j]) < 10:
+                curMot = '0' + curMot
+            motions[i].append(curMot)
+
+    assert len(subjects) == len(motions)
+
+    # Skeleton (.asf) and motion (.amc) files share one directory.
+    skel_dir = os.path.join(data_path, 'cmu_mocap')
+    for i in range(len(subjects)):
+        cur_skel_file = os.path.join(skel_dir, subjects[i] + '.asf')
        
+        url_required = False
+        file_download = []
+        if not os.path.exists(cur_skel_file):
+            # Current skel file doesn't exist.
+            if not os.path.isdir(skel_dir):
+                # makedirs rather than mkdir: data_path itself may not exist yet.
+                os.makedirs(skel_dir)
+            # Add skel file to list.
+            url_required = True
+            file_download.append(subjects[i] + '.asf')
+        for j in range(len(motions[i])):
+            file_name = subjects[i] + '_' + motions[i][j] + '.amc'
+            cur_motion_file = os.path.join(skel_dir, file_name)
+            if not os.path.exists(cur_motion_file):
+                url_required = True
+                file_download.append(file_name)
+        if url_required:
+            # Only subjects with something missing contribute an entry.
+            resource['urls'].append(cmu_url + subjects[i] + '/')
+            resource['files'].append(file_download)
+    return resource
+

 # Some general utilities.
 def sample_class(f):
@ -72,25 +254,17 @@ def sample_class(f):
    c = np.where(c, 1, -1)
    return c

-def download_resource(resource, save_name = None, save_file = True, messages = True):
-    if messages:
-        print "Downloading resource: " , resource, " ... ",
-    response = url.urlopen(resource)
-    # TODO: Some error checking...
-    # ...
-    html = response.read()
-    response.close()
-    if save_file:
-        # TODO: Check if already exists...
-        # ...
-        with open(save_name, "w") as text_file:
-            text_file.write("%s"%html)
-            if messages:
-                print "Done!"
-    return html
+def brendan_faces(data_set='brendan_faces'):
+    if not data_available(data_set):
+        download_data(data_set)
+    mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'frey_rawface.mat'))
+    Y = mat_data['ff'].T
+    return data_details_return({'Y': Y}, data_set)

-def della_gatta_TRP63_gene_expression(gene_number=None):
-    mat_data = scipy.io.loadmat(os.path.join(data_path, 'DellaGattadata.mat'))
+def della_gatta_TRP63_gene_expression(data_set='della_gatta', gene_number=None):
+    if not data_available(data_set):
+        download_data(data_set)
+    mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'DellaGattadata.mat'))
    X = np.double(mat_data['timepoints'])
    if gene_number == None:
        Y = mat_data['exprs_tp53_RMA']
@ -98,45 +272,62 @@ def della_gatta_TRP63_gene_expression(gene_number=None):
        Y = mat_data['exprs_tp53_RMA'][:, gene_number]
        if len(Y.shape) == 1:
            Y = Y[:, None]
-    return {'X': X, 'Y': Y, 'info': "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA."}
+    return data_details_return({'X': X, 'Y': Y, 'gene_number' : gene_number}, data_set)

-def simulation_BGPLVM():
-    mat_data = scipy.io.loadmat(os.path.join(data_path, 'BGPLVMSimulation.mat'))
-    Y = np.array(mat_data['Y'], dtype=float)
-    S = np.array(mat_data['initS'], dtype=float)
-    mu = np.array(mat_data['initMu'], dtype=float)
-    return {'Y': Y, 'S': S,
-            'mu' : mu,
-            'info': "Simulated test dataset generated in MATLAB to compare BGPLVM between python and MATLAB"}


 # The data sets
-def oil():
-    #if download_data('oil'):
-    oil_train_file = os.path.join(data_path, 'oil', 'DataTrn.txt')
-    oil_trainlbls_file = os.path.join(data_path, 'oil', 'DataTrnLbls.txt')
+def oil(data_set='three_phase_oil_flow'):
+    """The three phase oil data from Bishop and James (1993)."""
+    if not data_available(data_set):
+        download_data(data_set)
+    oil_train_file = os.path.join(data_path, data_set, 'DataTrn.txt')
+    oil_trainlbls_file = os.path.join(data_path, data_set, 'DataTrnLbls.txt')
+    oil_test_file = os.path.join(data_path, data_set, 'DataTst.txt')
+    oil_testlbls_file = os.path.join(data_path, data_set, 'DataTstLbls.txt')
+    oil_valid_file = os.path.join(data_path, data_set, 'DataVdn.txt')
+    oil_validlbls_file = os.path.join(data_path, data_set, 'DataVdnLbls.txt')
    fid = open(oil_train_file)
    X = np.fromfile(fid, sep='\t').reshape((-1, 12))
    fid.close()
+    fid = open(oil_test_file)
+    Xtest = np.fromfile(fid, sep='\t').reshape((-1, 12))
+    fid.close()
+    fid = open(oil_valid_file)
+    Xvalid = np.fromfile(fid, sep='\t').reshape((-1, 12))
+    fid.close()
    fid = open(oil_trainlbls_file)
    Y = np.fromfile(fid, sep='\t').reshape((-1, 3)) * 2. - 1.
    fid.close()
-    return {'X': X, 'Y': Y, 'info': "The oil data from Bishop and James (1993)."}
+    fid = open(oil_testlbls_file)
+    Ytest = np.fromfile(fid, sep='\t').reshape((-1, 3)) * 2. - 1.
+    fid.close()
+    fid = open(oil_validlbls_file)
+    Yvalid = np.fromfile(fid, sep='\t').reshape((-1, 3)) * 2. - 1.
+    fid.close()
+    # Each split appears exactly once: inputs X*, and labels Y* rescaled from {0, 1} to {-1, 1}.
+    return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'Xvalid': Xvalid, 'Yvalid': Yvalid}, data_set)
    #else:
    # throw an error
    
-def oil_100(seed=default_seed):
+def oil_100(seed=default_seed, data_set = 'three_phase_oil_flow'):
    np.random.seed(seed=seed)
    data = oil()
    indices = np.random.permutation(1000)
    indices = indices[0:100]
    X = data['X'][indices, :]
    Y = data['Y'][indices, :]
-    return {'X': X, 'Y': Y, 'info': "Subsample of the oil data extracting 100 values randomly without replacement."}
+    return data_details_return({'X': X, 'Y': Y, 'info': "Subsample of the full oil data extracting 100 values randomly without replacement, here seed was " + str(seed)}, data_set)

-def pumadyn(seed=default_seed):
+def pumadyn(seed=default_seed, data_set='pumadyn-32nm'):
+    if not data_available(data_set):
+        download_data(data_set)
+        path = os.path.join(data_path, data_set)
+        tar = tarfile.open(os.path.join(path, 'pumadyn-32nm.tar.gz'))
+        print('Extracting file.')
+        tar.extractall(path=path)
+        tar.close()
    # Data is variance 1, no need to normalize.
-    data = np.loadtxt(os.path.join(data_path, 'pumadyn-32nm/Dataset.data.gz'))
+    data = np.loadtxt(os.path.join(data_path, data_set, 'pumadyn-32nm', 'Dataset.data.gz'))
    indices = np.random.permutation(data.shape[0])
    indicesTrain = indices[0:7168]
    indicesTest = indices[7168:-1]
@ -146,20 +337,13 @@ def pumadyn(seed=default_seed):
    Y = data[indicesTrain, -1][:, None]
    Xtest = data[indicesTest, 0:-2]
    Ytest = data[indicesTest, -1][:, None]
-    return {'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "The puma robot arm data with 32 inputs. This data is the non linear case with medium noise (pumadyn-32nm). For training 7,168 examples are sampled without replacement."}
+    return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'seed': seed}, data_set)

-
-def brendan_faces():
-    mat_data = scipy.io.loadmat(os.path.join(data_path, 'frey_rawface.mat'))
-    Y = mat_data['ff'].T
-    return {'Y': Y, 'info': "Face data made available by Brendan Frey"}
-
-
-
-
-def silhouette():
+def silhouette(data_set='ankur_pose_data'):
    # Ankur Agarwal and Bill Trigg's silhoutte data.
-    mat_data = scipy.io.loadmat(os.path.join(data_path, 'mocap', 'ankur', 'ankurDataPoseSilhouette.mat'))
+    if not data_available(data_set):
+        download_data(data_set)
+    mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'ankurDataPoseSilhouette.mat'))
    inMean = np.mean(mat_data['Y'])
    inScales = np.sqrt(np.var(mat_data['Y']))
    X = mat_data['Y'] - inMean
@ -168,22 +352,35 @@ def silhouette():
    Xtest = Xtest / inScales
    Y = mat_data['Z']
    Ytest = mat_data['Z_test']
-    return {'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "Artificial silhouette simulation data developed from Agarwal and Triggs (2004)."}
+    return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest}, data_set)

-def stick():
-    #if download_data('stick'):
-    Y, connect = GPy.util.mocap.load_text_data('run1', data_path)
+def ripley_synth(data_set='ripley_prnn_data'):
+    if not data_available(data_set):
+        download_data(data_set)
+    train = np.genfromtxt(os.path.join(data_path, data_set, 'synth.tr'), skip_header=1)
+    X = train[:, 0:2]
+    y = train[:, 2:3]
+    test = np.genfromtxt(os.path.join(data_path, data_set, 'synth.te'), skip_header=1)
+    Xtest = test[:, 0:2]
+    ytest = test[:, 2:3]
+    return data_details_return({'X': X, 'y': y, 'Xtest': Xtest, 'ytest': ytest, 'info': 'Synthetic data generated by Ripley for a two class classification problem.'}, data_set)
+
+def osu_run1(data_set='osu_run1'):
+    """Load the OSU stick man running data, downloading it first when it is not on disk; Y keeps every fourth row."""
+    if not data_available(data_set):
+        download_data(data_set)
+    path = os.path.join(data_path, data_set)
+    # Unpack next to the archive and close it explicitly so the file handle is not leaked.
+    archive = zipfile.ZipFile(os.path.join(path, 'sprintTXT.ZIP'), 'r')
+    archive.extractall(path)
+    archive.close()
+    Y, connect = GPy.util.mocap.load_text_data('Aug210107', path)
    Y = Y[0:-1:4, :]
-    lbls = 'connect'
-    return {'Y': Y, 'connect' : connect, 'info': "Stick man data from Ohio."}
-    # else:
-    # throw an error.
+    return data_details_return({'Y': Y, 'connect' : connect}, data_set)

-def swiss_roll_generated(N=1000, sigma=0.0):
+def swiss_roll_generated(num_samples=1000, sigma=0.0):
    with open(os.path.join(data_path, 'swiss_roll.pickle')) as f:
        data = pickle.load(f)
    Na = data['Y'].shape[0]
-    perm = np.random.permutation(np.r_[:Na])[:N]
+    perm = np.random.permutation(np.r_[:Na])[:num_samples]
    Y = data['Y'][perm, :]
    t = data['t'][perm]
    c = data['colors'][perm, :]
@ -194,17 +391,34 @@ def swiss_roll_generated(N=1000, sigma=0.0):
    return {'Y':Y, 't':t, 'colors':c}

 def swiss_roll_1000():
-    mat_data = scipy.io.loadmat(os.path.join(data_path, 'swiss_roll_data'))
-    Y = mat_data['X_data'][:, 0:1000].transpose()
-    return {'Y': Y, 'info': "Subsample of the swiss roll data extracting only the first 1000 values."}
+    return swiss_roll(num_samples=1000)

-def swiss_roll(N=3000):
-    mat_data = scipy.io.loadmat(os.path.join(data_path, 'swiss_roll_data.mat'))
-    Y = mat_data['X_data'][:, 0:N].transpose()
-    return {'Y': Y, 'X': mat_data['X_data'], 'info': "The first 3,000 points from the swiss roll data of Tennenbaum, de Silva and Langford (2001)."}
+def swiss_roll(num_samples=3000, data_set='swiss_roll'):
+    if not data_available(data_set):
+        download_data(data_set)
+    mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'swiss_roll_data.mat'))
+    Y = mat_data['X_data'][:, 0:num_samples].transpose()
+    return data_details_return({'Y': Y, 'X': mat_data['X_data'], 'info': "The first " + str(num_samples) + " points from the swiss roll data of Tennenbaum, de Silva and Langford (2001)."}, data_set)
+
+def isomap_faces(num_samples=698, data_set='isomap_face_data'):
+    if not data_available(data_set):
+        download_data(data_set)
+    mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'face_data.mat'))
+    Y = mat_data['images'][:, 0:num_samples].transpose()
+    return data_details_return({'Y': Y, 'poses' : mat_data['poses'], 'lights': mat_data['lights'], 'info': "The first " + str(num_samples) + " points from the face data of Tennenbaum, de Silva and Langford (2001)."}, data_set)
+
+def simulation_BGPLVM():
+    """Load the simulated BGPLVM test data (Y, initial S and initial mu) from BGPLVMSimulation.mat in data_path."""
+    mat_data = scipy.io.loadmat(os.path.join(data_path, 'BGPLVMSimulation.mat'))
+    Y = np.array(mat_data['Y'], dtype=float)
+    S = np.array(mat_data['initS'], dtype=float)
+    mu = np.array(mat_data['initMu'], dtype=float)
+    # This set has no entry in data_resources and the function takes no data_set argument, so the dictionary is returned directly.
    return {'Y': Y, 'S': S,
            'mu' : mu,
            'info': "Simulated test dataset generated in MATLAB to compare BGPLVM between python and MATLAB"}

 def toy_rbf_1d(seed=default_seed, num_samples=500):
-    """Samples 500 values of a function from an RBF covariance with very small noise for inputs uniformly distributed between -1 and 1.
+    """Samples values of a function from an RBF covariance with very small noise for inputs uniformly distributed between -1 and 1.
    :param seed: seed to use for random sampling.
    :type seed: int
    :param num_samples: number of samples to sample in the function (default 500).
@ -219,7 +433,7 @@ def toy_rbf_1d(seed=default_seed, num_samples=500):
    kernel = rbf + white
    K = kernel.K(X)
    y = np.reshape(np.random.multivariate_normal(np.zeros(num_samples), K), (num_samples, 1))
-    return {'X':X, 'Y':y, 'info': "Samples 500 values of a function from an RBF covariance with very small noise for inputs uniformly distributed between -1 and 1."}
+    return {'X':X, 'Y':y, 'info': "Sampled " + str(num_samples) + " values of a function from an RBF covariance with very small noise for inputs uniformly distributed between -1 and 1."}

 def toy_rbf_1d_50(seed=default_seed):
    np.random.seed(seed=seed)
@ -229,7 +443,7 @@ def toy_rbf_1d_50(seed=default_seed):
    indices.sort(axis=0)
    X = data['X'][indices, :]
    Y = data['Y'][indices, :]
-    return {'X': X, 'Y': Y, 'info': "Subsamples the toy_rbf_sample with 50 values randomly taken from the original sample."}
+    return {'X': X, 'Y': Y, 'info': "Subsamples the toy_rbf_sample with 50 values randomly taken from the original sample.", 'seed' : seed}


 def toy_linear_1d_classification(seed=default_seed):
@ -237,13 +451,31 @@ def toy_linear_1d_classification(seed=default_seed):
    x1 = np.random.normal(-3, 5, 20)
    x2 = np.random.normal(3, 5, 20)
    X = (np.r_[x1, x2])[:, None]
-    return {'X': X, 'Y':  sample_class(2.*X), 'F': 2.*X}
+    return {'X': X, 'Y':  sample_class(2.*X), 'F': 2.*X, 'seed' : seed}
+
+def olympic_100m_men(data_set='rogers_girolami_data'):
+    if not data_available(data_set):
+        download_data(data_set)
+        path = os.path.join(data_path, data_set)
+        tar_file = os.path.join(path, 'firstcoursemldata.tar.gz')
+        tar = tarfile.open(tar_file)
+        print('Extracting file.')
+        tar.extractall(path=path)
+        tar.close()
+    olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['male100']

-def rogers_girolami_olympics():
-    olympic_data = scipy.io.loadmat(os.path.join(data_path, 'olympics.mat'))['male100']
    X = olympic_data[:, 0][:, None]
    Y = olympic_data[:, 1][:, None]
-    return {'X': X, 'Y': Y, 'info': "Olympic sprint times for 100 m men from 1896 until 2008. Example is from Rogers and Girolami's First Course in Machine Learning."}
+    return data_details_return({'X': X, 'Y': Y, 'info': "Olympic sprint times for 100 m men from 1896 until 2008. Example is from Rogers and Girolami's First Course in Machine Learning."}, data_set)
+
+def olympic_marathon_men(data_set='olympic_marathon_men'):
+    if not data_available(data_set):
+        download_data(data_set)
+    olympics = np.genfromtxt(os.path.join(data_path, data_set, 'olympicMarathonTimes.csv'), delimiter=',')
+    X = olympics[:, 0:1]
+    Y = olympics[:, 1:2]
+    return data_details_return({'X': X, 'Y': Y}, data_set)
+
 # def movielens_small(partNo=1,seed=default_seed):
 #     np.random.seed(seed=seed)

@ -277,8 +509,6 @@ def rogers_girolami_olympics():
 #     return {'Y':Y, 'lbls':lbls, 'Ytest':Ytest, 'lblstest':lblstest}


-
-
 def crescent_data(num_data=200, seed=default_seed):
    """Data set formed from a mixture of four Gaussians. In each class two of the Gaussians are elongated at right angles to each other and offset to form an approximation to the crescent data that is popular in semi-supervised learning as a toy problem.
    :param num_data: number of data to be sampled (default is 200).
@ -307,7 +537,6 @@ def crescent_data(num_data=200, seed=default_seed):
    for i in range(0, 4):
        num_data_part.append(round(((i + 1) * num_data) / 4.))
        num_data_part[i] -= num_data_total
-        # print num_data_part[i]
        part = np.random.normal(size=(num_data_part[i], 2))
        part = np.dot(np.dot(part, scales[i]), R) + means[i]
        Xparts.append(part)
@ -318,13 +547,22 @@ def crescent_data(num_data=200, seed=default_seed):
    Y = np.vstack((np.ones((num_data_part[0] + num_data_part[1], 1)), -np.ones((num_data_part[2] + num_data_part[3], 1))))
    return {'X':X, 'Y':Y, 'info': "Two separate classes of data formed approximately in the shape of two crescents."}

-def creep_data():
-    all_data = np.loadtxt(os.path.join(data_path, 'creep', 'taka'))
+def creep_data(data_set='creep_rupture'):
+    """Brun and Yoshida's metal creep rupture data.
+
+    If ``data_available(data_set)`` is false, the resource is downloaded and the
+    archive ``creeprupt.tar`` is unpacked into ``data_path/data_set``. The table
+    is then read from the file ``taka`` in that directory.
+
+    :param data_set: name of the data resource, also used as the sub-directory of ``data_path`` (default is 'creep_rupture').
+    :returns: the result of ``data_details_return`` applied to a dictionary with keys 'X' and 'y'. Note the lower-case 'y', unlike the 'Y' key used by the Olympic loaders in this file.
+    """
+    # Extraction only happens on the download path: an existing data set is
+    # assumed to be unpacked already.
+    if not data_available(data_set):
+        download_data(data_set)
+        path = os.path.join(data_path, data_set)
+        tar_file = os.path.join(path, 'creeprupt.tar')
+        tar = tarfile.open(tar_file)
+        print('Extracting file.')
+        tar.extractall(path=path)
+        tar.close()
+    all_data = np.loadtxt(os.path.join(data_path, data_set, 'taka'))
+    # Column 1 is the target; the 1:2 slice keeps it two-dimensional.
    y = all_data[:, 1:2].copy()
+    # Inputs are column 0 plus columns 2 to 30 inclusive, i.e. everything except the target column.
    features = [0]
    features.extend(range(2, 31))
    X = all_data[:, features].copy()
-    return {'X': X, 'y' : y}
+    return data_details_return({'X': X, 'y': y}, data_set)

 def cmu_mocap_49_balance():
    """Load CMU subject 49's one legged balancing motion that was used by Alvarez, Luengo and Lawrence at AISTATS 2009."""
@ -346,14 +584,19 @@ def cmu_mocap_35_walk_jog():
    data['info'] = "Walk and jog data from CMU data base subject 35. As used in Tayor, Roweis and Hinton at NIPS 2007, but without their pre-processing (i.e. as used by Lawrence at AISTATS 2007). It consists of " + data['info']
    return data

-def cmu_mocap(subject, train_motions, test_motions=[], sample_every=4):
+def cmu_mocap(subject, train_motions, test_motions=[], sample_every=4, data_set='cmu_mocap'):
    """Load a given subject's training and test motions from the CMU motion capture data."""
-
    # Load in subject skeleton.
-    subject_dir = os.path.join(data_path, 'mocap', 'cmu', subject)
+    subject_dir = os.path.join(data_path, data_set)

    # Make sure the data is downloaded.
-    mocap.fetch_cmu(([subject], [train_motions]), skel_store_dir=subject_dir,motion_store_dir=subject_dir)
+    all_motions = train_motions + test_motions
+    resource = cmu_urls_files(([subject], [all_motions]))
+    data_resources[data_set] = data_resources['cmu_mocap_full']
+    data_resources[data_set]['files'] = resource['files']
+    data_resources[data_set]['urls'] = resource['urls']
+    if resource['urls']:
+        download_data(data_set)

    skel = GPy.util.mocap.acclaim_skeleton(os.path.join(subject_dir, subject + '.asf'))

@ -418,4 +661,4 @@ def cmu_mocap(subject, train_motions, test_motions=[], sample_every=4):
        info += '.'
    if sample_every != 1:
        info += ' Data is sub-sampled to every ' + str(sample_every) + ' frames.'
-    return {'Y': Y, 'lbls' : lbls, 'Ytest': Ytest, 'lblstest' : lblstest, 'info': info, 'skel': skel}
+    return data_details_return({'Y': Y, 'lbls' : lbls, 'Ytest': Ytest, 'lblstest' : lblstest, 'info': info, 'skel': skel}, data_set)