Merge branch 'devel' of https://github.com/SheffieldML/GPy into devel

Neil Lawrence 2014-05-24 15:31:50 +01:00
commit e3b6d9c9c5
49 changed files with 1817 additions and 867 deletions


@@ -671,7 +671,7 @@ def osu_run1(data_set='osu_run1', sample_every=4):
return data_details_return({'Y': Y, 'connect' : connect}, data_set)
def swiss_roll_generated(num_samples=1000, sigma=0.0):
with open(os.path.join(data_path, 'swiss_roll.pickle')) as f:
with open(os.path.join(os.path.dirname(__file__), 'datasets', 'swiss_roll.pickle')) as f:
data = pickle.load(f)
Na = data['Y'].shape[0]
perm = np.random.permutation(np.r_[:Na])[:num_samples]
@@ -723,14 +723,20 @@ def hapmap3(data_set='hapmap3'):
import bz2
except ImportError as i:
raise ImportError("Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset")
if not data_available(data_set):
download_data(data_set)
dirpath = os.path.join(data_path,'hapmap3')
hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly'
unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']]
unpacked_files_exist = reduce(lambda a, b:a and b, map(os.path.exists, unpacked_files))
if not unpacked_files_exist and not data_available(data_set):
download_data(data_set)
preprocessed_data_paths = [os.path.join(dirpath,hapmap_file_name + file_name) for file_name in \
['.snps.pickle',
'.info.pickle',
'.nan.pickle']]
if not reduce(lambda a,b: a and b, map(os.path.exists, preprocessed_data_paths)):
if not overide_manual_authorize and not prompt_user("Preprocessing requires ~25GB "
"of memory and can take a (very) long time, continue? [Y/n]"):
@@ -744,8 +750,7 @@ def hapmap3(data_set='hapmap3'):
perc="="*int(20.*progress/100.))
stdout.write(status); stdout.flush()
return status
unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']]
if not reduce(lambda a,b: a and b, map(os.path.exists, unpacked_files)):
if not unpacked_files_exist:
status=write_status('unpacking...', 0, '')
curr = 0
for newfilepath in unpacked_files:
@@ -762,6 +767,7 @@ def hapmap3(data_set='hapmap3'):
status=write_status('unpacking...', curr+12.*file_processed/(file_size), status)
curr += 12
status=write_status('unpacking...', curr, status)
os.remove(filepath)
status=write_status('reading .ped...', 25, status)
# Preprocess data:
snpstrnp = np.loadtxt(unpacked_files[0], dtype=str)
@@ -832,7 +838,7 @@ def hapmap3(data_set='hapmap3'):
def singlecell(data_set='singlecell'):
if not data_available(data_set):
download_data(data_set)
from pandas import read_csv
dirpath = os.path.join(data_path, data_set)
filename = os.path.join(dirpath, 'singlecell.csv')