HapMap3 dataset added

2026-06-11 15:15:15 +02:00 · 2014-02-03 15:01:40 +00:00 · 2014-02-03 15:01:40 +00:00 · ca632d1389
commit ca632d1389
parent 54a9ff2a06
1 changed files with 90 additions and 5 deletions
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@ -453,14 +453,99 @@ def swiss_roll_generated(num_samples=1000, sigma=0.0):

 def hapmap3(data_set='hapmap3'):
    try:
-        from pandas import read_pickle
+        from pandas import read_pickle, DataFrame
+        from sys import stdout
+        import bz2
    except ImportError as i:
-        raise i, "Need pandas for hapmap dataset, make sure to install pandas before loading the hapmap dataset"
+        raise i, "Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset"
    if not data_available(data_set):
        download_data(data_set)
-    snpsdf = read_pickle(os.path.join(data_path,'HapMap3','hapmap3_r2_b36_fwd.consensus.qc.poly.snps.pickle'))
-    metadf = read_pickle(os.path.join(data_path,'HapMap3','hapmap3_r2_b36_fwd.consensus.qc.poly.info.pickle'))
-    inandf = read_pickle(os.path.join(data_path,'HapMap3','hapmap3_r2_b36_fwd.consensus.qc.poly.nan.pickle'))
+    dirpath = os.path.join(data_path,'hapmap3')
+    hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly'
+    preprocessed_data_paths = [os.path.join(dirpath,hapmap_file_name + file_name) for file_name in \
+                               ['.snps.pickle',
+                                '.info.pickle',
+                                '.nan.pickle']]
+    if not reduce(lambda a,b: a and b, map(os.path.exists, preprocessed_data_paths)):
+        if not overide_manual_authorize and prompt_user("Preprocessing requires 17GB of memory and can take alot of time, continue? [Y/n]\n"):
+            print "Preprocessing required for further usage."
+            return
+        status = "Preprocessing data, please be patient..."
+        print status
+        def write_status(message, progress, status):
+            stdout.write(" "*len(status)); stdout.write("\r"); stdout.flush()
+            status = r"[{perc: <{ll}}] {message: <13s}".format(message=message, ll=20,
+                                                               perc="="*int(20.*progress/100.))
+            stdout.write(status); stdout.flush()
+            return status
+        unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']]
+        if not reduce(lambda a,b: a and b, map(os.path.exists, unpacked_files)):
+            status=write_status('unpacking...', 0, '')
+            curr = 0
+            for newfilepath in unpacked_files:
+                if not os.path.exists(newfilepath):
+                    filepath = newfilepath + '.bz2'
+                    file_size = os.path.getsize(filepath)
+                    with open(newfilepath, 'wb') as new_file, open(filepath, 'rb') as f:
+                        decomp = bz2.BZ2Decompressor()
+                        file_processed = 0
+                        buffsize = 100 * 1024
+                        for data in iter(lambda : f.read(buffsize), b''):
+                            new_file.write(decomp.decompress(data))
+                            file_processed += len(data)
+                            write_status('unpacking...', curr+12.*file_processed/(file_size), status)
+                curr += 12
+                status=write_status('unpacking...', curr, status)
+        status=write_status('reading .ped...', 25, status)
+        # Preprocess data:    
+        snpstrnp = np.loadtxt('hapmap3_r2_b36_fwd.consensus.qc.poly.ped', dtype=str)
+        status=write_status('reading .map...', 33, status)
+        mapnp = np.loadtxt('hapmap3_r2_b36_fwd.consensus.qc.poly.map', dtype=str)
+        status=write_status('reading relationships.txt...', 42, status)
+        # and metainfo:
+        infodf = DataFrame.from_csv('./relationships_w_pops_121708.txt', header=0, sep='\t')
+        infodf.set_index('IID', inplace=1)
+        status=write_status('filtering nan...', 45, status)
+        snpstr = snpstrnp[:,6:].astype('S1').reshape(snpstrnp.shape[0], -1, 2)
+        inan = snpstr[:,:,0] == '0'
+        status=write_status('filtering reference alleles...', 55, status)
+        ref = np.array(map(lambda x: np.unique(x)[-2:], snpstr.swapaxes(0,1)[:,:,:]))
+        status=write_status('encoding snps...', 70, status)
+        # Encode the information for each gene in {-1,0,1}:
+        status=write_status('encoding snps...', 73, status)
+        snps = (snpstr==ref[:,:,None])
+        status=write_status('encoding snps...', 76, status)
+        snps = (snps*np.array([1,-1])[None,None,:])
+        status=write_status('encoding snps...', 78, status)
+        snps = snps.sum(-1)
+        status=write_status('encoding snps', 81, status)
+        snps = snps.astype('S1')
+        status=write_status('marking nan values...', 88, status)
+        # put in nan values (masked as -128):
+        snps[inan] = -128
+        status=write_status('setting up meta...', 94, status)
+        # get meta information:
+        metaheader = np.r_[['family_id', 'iid', 'paternal_id', 'maternal_id', 'sex', 'phenotype']]
+        metadf = DataFrame(columns=metaheader, data=snpstrnp[:,:6])
+        metadf.set_index('iid', inplace=1)
+        metadf = metadf.join(infodf.population)
+        metadf.to_pickle(preprocessed_data_paths[1])
+        # put everything together:
+        status=write_status('setting up snps...', 96, status)
+        snpsdf = DataFrame(index=metadf.index, data=snps, columns=mapnp[:,1])
+        snpsdf.to_pickle(preprocessed_data_paths[0])
+        status=write_status('setting up snps...', 98, status)
+        inandf = DataFrame(index=metadf.index, data=inan, columns=mapnp[:,1])
+        inandf.to_pickle(preprocessed_data_paths[2])
+        status=write_status('done :)', 100, status)
+        print ''
+    else:
+        print "loading snps..."
+        snpsdf = read_pickle(preprocessed_data_paths[0])
+        print "loading metainfo..."
+        metadf = read_pickle(preprocessed_data_paths[1])
+        print "loading nan entries..."
+        inandf = read_pickle(preprocessed_data_paths[2])
    snps = snpsdf.values
    populations = metadf.population.values.astype('S3')
    hapmap = dict(name=data_set,