HapMap3 dataset added

This commit is contained in:
Max Zwiessele 2014-02-03 15:01:40 +00:00
parent 54a9ff2a06
commit ca632d1389

View file

@ -453,14 +453,99 @@ def swiss_roll_generated(num_samples=1000, sigma=0.0):
def hapmap3(data_set='hapmap3'):
try:
from pandas import read_pickle
from pandas import read_pickle, DataFrame
from sys import stdout
import bz2
except ImportError as i:
raise i, "Need pandas for hapmap dataset, make sure to install pandas before loading the hapmap dataset"
raise i, "Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset"
if not data_available(data_set):
download_data(data_set)
snpsdf = read_pickle(os.path.join(data_path,'HapMap3','hapmap3_r2_b36_fwd.consensus.qc.poly.snps.pickle'))
metadf = read_pickle(os.path.join(data_path,'HapMap3','hapmap3_r2_b36_fwd.consensus.qc.poly.info.pickle'))
inandf = read_pickle(os.path.join(data_path,'HapMap3','hapmap3_r2_b36_fwd.consensus.qc.poly.nan.pickle'))
dirpath = os.path.join(data_path,'hapmap3')
hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly'
preprocessed_data_paths = [os.path.join(dirpath,hapmap_file_name + file_name) for file_name in \
['.snps.pickle',
'.info.pickle',
'.nan.pickle']]
if not reduce(lambda a,b: a and b, map(os.path.exists, preprocessed_data_paths)):
if not overide_manual_authorize and prompt_user("Preprocessing requires 17GB of memory and can take alot of time, continue? [Y/n]\n"):
print "Preprocessing required for further usage."
return
status = "Preprocessing data, please be patient..."
print status
def write_status(message, progress, status):
stdout.write(" "*len(status)); stdout.write("\r"); stdout.flush()
status = r"[{perc: <{ll}}] {message: <13s}".format(message=message, ll=20,
perc="="*int(20.*progress/100.))
stdout.write(status); stdout.flush()
return status
unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']]
if not reduce(lambda a,b: a and b, map(os.path.exists, unpacked_files)):
status=write_status('unpacking...', 0, '')
curr = 0
for newfilepath in unpacked_files:
if not os.path.exists(newfilepath):
filepath = newfilepath + '.bz2'
file_size = os.path.getsize(filepath)
with open(newfilepath, 'wb') as new_file, open(filepath, 'rb') as f:
decomp = bz2.BZ2Decompressor()
file_processed = 0
buffsize = 100 * 1024
for data in iter(lambda : f.read(buffsize), b''):
new_file.write(decomp.decompress(data))
file_processed += len(data)
write_status('unpacking...', curr+12.*file_processed/(file_size), status)
curr += 12
status=write_status('unpacking...', curr, status)
status=write_status('reading .ped...', 25, status)
# Preprocess data:
snpstrnp = np.loadtxt('hapmap3_r2_b36_fwd.consensus.qc.poly.ped', dtype=str)
status=write_status('reading .map...', 33, status)
mapnp = np.loadtxt('hapmap3_r2_b36_fwd.consensus.qc.poly.map', dtype=str)
status=write_status('reading relationships.txt...', 42, status)
# and metainfo:
infodf = DataFrame.from_csv('./relationships_w_pops_121708.txt', header=0, sep='\t')
infodf.set_index('IID', inplace=1)
status=write_status('filtering nan...', 45, status)
snpstr = snpstrnp[:,6:].astype('S1').reshape(snpstrnp.shape[0], -1, 2)
inan = snpstr[:,:,0] == '0'
status=write_status('filtering reference alleles...', 55, status)
ref = np.array(map(lambda x: np.unique(x)[-2:], snpstr.swapaxes(0,1)[:,:,:]))
status=write_status('encoding snps...', 70, status)
# Encode the information for each gene in {-1,0,1}:
status=write_status('encoding snps...', 73, status)
snps = (snpstr==ref[:,:,None])
status=write_status('encoding snps...', 76, status)
snps = (snps*np.array([1,-1])[None,None,:])
status=write_status('encoding snps...', 78, status)
snps = snps.sum(-1)
status=write_status('encoding snps', 81, status)
snps = snps.astype('S1')
status=write_status('marking nan values...', 88, status)
# put in nan values (masked as -128):
snps[inan] = -128
status=write_status('setting up meta...', 94, status)
# get meta information:
metaheader = np.r_[['family_id', 'iid', 'paternal_id', 'maternal_id', 'sex', 'phenotype']]
metadf = DataFrame(columns=metaheader, data=snpstrnp[:,:6])
metadf.set_index('iid', inplace=1)
metadf = metadf.join(infodf.population)
metadf.to_pickle(preprocessed_data_paths[1])
# put everything together:
status=write_status('setting up snps...', 96, status)
snpsdf = DataFrame(index=metadf.index, data=snps, columns=mapnp[:,1])
snpsdf.to_pickle(preprocessed_data_paths[0])
status=write_status('setting up snps...', 98, status)
inandf = DataFrame(index=metadf.index, data=inan, columns=mapnp[:,1])
inandf.to_pickle(preprocessed_data_paths[2])
status=write_status('done :)', 100, status)
print ''
else:
print "loading snps..."
snpsdf = read_pickle(preprocessed_data_paths[0])
print "loading metainfo..."
metadf = read_pickle(preprocessed_data_paths[1])
print "loading nan entries..."
inandf = read_pickle(preprocessed_data_paths[2])
snps = snpsdf.values
populations = metadf.population.values.astype('S3')
hapmap = dict(name=data_set,