diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index 9167a570..30c09eaa 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -453,14 +453,99 @@ def swiss_roll_generated(num_samples=1000, sigma=0.0): def hapmap3(data_set='hapmap3'): try: - from pandas import read_pickle + from pandas import read_pickle, DataFrame + from sys import stdout + import bz2 except ImportError as i: - raise i, "Need pandas for hapmap dataset, make sure to install pandas before loading the hapmap dataset" + raise i, "Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset" if not data_available(data_set): download_data(data_set) - snpsdf = read_pickle(os.path.join(data_path,'HapMap3','hapmap3_r2_b36_fwd.consensus.qc.poly.snps.pickle')) - metadf = read_pickle(os.path.join(data_path,'HapMap3','hapmap3_r2_b36_fwd.consensus.qc.poly.info.pickle')) - inandf = read_pickle(os.path.join(data_path,'HapMap3','hapmap3_r2_b36_fwd.consensus.qc.poly.nan.pickle')) + dirpath = os.path.join(data_path,'hapmap3') + hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly' + preprocessed_data_paths = [os.path.join(dirpath,hapmap_file_name + file_name) for file_name in \ + ['.snps.pickle', + '.info.pickle', + '.nan.pickle']] + if not reduce(lambda a,b: a and b, map(os.path.exists, preprocessed_data_paths)): + if not overide_manual_authorize and prompt_user("Preprocessing requires 17GB of memory and can take alot of time, continue? [Y/n]\n"): + print "Preprocessing required for further usage." + return + status = "Preprocessing data, please be patient..." + print status + def write_status(message, progress, status): + stdout.write(" "*len(status)); stdout.write("\r"); stdout.flush() + status = r"[{perc: <{ll}}] {message: <13s}".format(message=message, ll=20, + perc="="*int(20.*progress/100.)) + stdout.write(status); stdout.flush() + return status + unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']] + if not reduce(lambda a,b: a and b, map(os.path.exists, unpacked_files)): + status=write_status('unpacking...', 0, '') + curr = 0 + for newfilepath in unpacked_files: + if not os.path.exists(newfilepath): + filepath = newfilepath + '.bz2' + file_size = os.path.getsize(filepath) + with open(newfilepath, 'wb') as new_file, open(filepath, 'rb') as f: + decomp = bz2.BZ2Decompressor() + file_processed = 0 + buffsize = 100 * 1024 + for data in iter(lambda : f.read(buffsize), b''): + new_file.write(decomp.decompress(data)) + file_processed += len(data) + write_status('unpacking...', curr+12.*file_processed/(file_size), status) + curr += 12 + status=write_status('unpacking...', curr, status) + status=write_status('reading .ped...', 25, status) + # Preprocess data: + snpstrnp = np.loadtxt('hapmap3_r2_b36_fwd.consensus.qc.poly.ped', dtype=str) + status=write_status('reading .map...', 33, status) + mapnp = np.loadtxt('hapmap3_r2_b36_fwd.consensus.qc.poly.map', dtype=str) + status=write_status('reading relationships.txt...', 42, status) + # and metainfo: + infodf = DataFrame.from_csv('./relationships_w_pops_121708.txt', header=0, sep='\t') + infodf.set_index('IID', inplace=1) + status=write_status('filtering nan...', 45, status) + snpstr = snpstrnp[:,6:].astype('S1').reshape(snpstrnp.shape[0], -1, 2) + inan = snpstr[:,:,0] == '0' + status=write_status('filtering reference alleles...', 55, status) + ref = np.array(map(lambda x: np.unique(x)[-2:], snpstr.swapaxes(0,1)[:,:,:])) + status=write_status('encoding snps...', 70, status) + # Encode the information for each gene in {-1,0,1}: + status=write_status('encoding snps...', 73, status) + snps = (snpstr==ref[:,:,None]) + status=write_status('encoding snps...', 76, status) + snps = (snps*np.array([1,-1])[None,None,:]) + status=write_status('encoding snps...', 78, status) + snps = snps.sum(-1) + status=write_status('encoding snps', 81, status) + snps = snps.astype('S1') + status=write_status('marking nan values...', 88, status) + # put in nan values (masked as -128): + snps[inan] = -128 + status=write_status('setting up meta...', 94, status) + # get meta information: + metaheader = np.r_[['family_id', 'iid', 'paternal_id', 'maternal_id', 'sex', 'phenotype']] + metadf = DataFrame(columns=metaheader, data=snpstrnp[:,:6]) + metadf.set_index('iid', inplace=1) + metadf = metadf.join(infodf.population) + metadf.to_pickle(preprocessed_data_paths[1]) + # put everything together: + status=write_status('setting up snps...', 96, status) + snpsdf = DataFrame(index=metadf.index, data=snps, columns=mapnp[:,1]) + snpsdf.to_pickle(preprocessed_data_paths[0]) + status=write_status('setting up snps...', 98, status) + inandf = DataFrame(index=metadf.index, data=inan, columns=mapnp[:,1]) + inandf.to_pickle(preprocessed_data_paths[2]) + status=write_status('done :)', 100, status) + print '' + else: + print "loading snps..." + snpsdf = read_pickle(preprocessed_data_paths[0]) + print "loading metainfo..." + metadf = read_pickle(preprocessed_data_paths[1]) + print "loading nan entries..." + inandf = read_pickle(preprocessed_data_paths[2]) snps = snpsdf.values populations = metadf.population.values.astype('S3') hapmap = dict(name=data_set,