mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-06-11 15:15:15 +02:00
HapMap3 dataset added
This commit is contained in:
parent
54a9ff2a06
commit
ca632d1389
1 changed files with 90 additions and 5 deletions
|
|
@ -453,14 +453,99 @@ def swiss_roll_generated(num_samples=1000, sigma=0.0):
|
|||
|
||||
def hapmap3(data_set='hapmap3'):
|
||||
try:
|
||||
from pandas import read_pickle
|
||||
from pandas import read_pickle, DataFrame
|
||||
from sys import stdout
|
||||
import bz2
|
||||
except ImportError as i:
|
||||
raise i, "Need pandas for hapmap dataset, make sure to install pandas before loading the hapmap dataset"
|
||||
raise i, "Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset"
|
||||
if not data_available(data_set):
|
||||
download_data(data_set)
|
||||
snpsdf = read_pickle(os.path.join(data_path,'HapMap3','hapmap3_r2_b36_fwd.consensus.qc.poly.snps.pickle'))
|
||||
metadf = read_pickle(os.path.join(data_path,'HapMap3','hapmap3_r2_b36_fwd.consensus.qc.poly.info.pickle'))
|
||||
inandf = read_pickle(os.path.join(data_path,'HapMap3','hapmap3_r2_b36_fwd.consensus.qc.poly.nan.pickle'))
|
||||
dirpath = os.path.join(data_path,'hapmap3')
|
||||
hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly'
|
||||
preprocessed_data_paths = [os.path.join(dirpath,hapmap_file_name + file_name) for file_name in \
|
||||
['.snps.pickle',
|
||||
'.info.pickle',
|
||||
'.nan.pickle']]
|
||||
if not reduce(lambda a,b: a and b, map(os.path.exists, preprocessed_data_paths)):
|
||||
if not overide_manual_authorize and prompt_user("Preprocessing requires 17GB of memory and can take alot of time, continue? [Y/n]\n"):
|
||||
print "Preprocessing required for further usage."
|
||||
return
|
||||
status = "Preprocessing data, please be patient..."
|
||||
print status
|
||||
def write_status(message, progress, status):
|
||||
stdout.write(" "*len(status)); stdout.write("\r"); stdout.flush()
|
||||
status = r"[{perc: <{ll}}] {message: <13s}".format(message=message, ll=20,
|
||||
perc="="*int(20.*progress/100.))
|
||||
stdout.write(status); stdout.flush()
|
||||
return status
|
||||
unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']]
|
||||
if not reduce(lambda a,b: a and b, map(os.path.exists, unpacked_files)):
|
||||
status=write_status('unpacking...', 0, '')
|
||||
curr = 0
|
||||
for newfilepath in unpacked_files:
|
||||
if not os.path.exists(newfilepath):
|
||||
filepath = newfilepath + '.bz2'
|
||||
file_size = os.path.getsize(filepath)
|
||||
with open(newfilepath, 'wb') as new_file, open(filepath, 'rb') as f:
|
||||
decomp = bz2.BZ2Decompressor()
|
||||
file_processed = 0
|
||||
buffsize = 100 * 1024
|
||||
for data in iter(lambda : f.read(buffsize), b''):
|
||||
new_file.write(decomp.decompress(data))
|
||||
file_processed += len(data)
|
||||
write_status('unpacking...', curr+12.*file_processed/(file_size), status)
|
||||
curr += 12
|
||||
status=write_status('unpacking...', curr, status)
|
||||
status=write_status('reading .ped...', 25, status)
|
||||
# Preprocess data:
|
||||
snpstrnp = np.loadtxt('hapmap3_r2_b36_fwd.consensus.qc.poly.ped', dtype=str)
|
||||
status=write_status('reading .map...', 33, status)
|
||||
mapnp = np.loadtxt('hapmap3_r2_b36_fwd.consensus.qc.poly.map', dtype=str)
|
||||
status=write_status('reading relationships.txt...', 42, status)
|
||||
# and metainfo:
|
||||
infodf = DataFrame.from_csv('./relationships_w_pops_121708.txt', header=0, sep='\t')
|
||||
infodf.set_index('IID', inplace=1)
|
||||
status=write_status('filtering nan...', 45, status)
|
||||
snpstr = snpstrnp[:,6:].astype('S1').reshape(snpstrnp.shape[0], -1, 2)
|
||||
inan = snpstr[:,:,0] == '0'
|
||||
status=write_status('filtering reference alleles...', 55, status)
|
||||
ref = np.array(map(lambda x: np.unique(x)[-2:], snpstr.swapaxes(0,1)[:,:,:]))
|
||||
status=write_status('encoding snps...', 70, status)
|
||||
# Encode the information for each gene in {-1,0,1}:
|
||||
status=write_status('encoding snps...', 73, status)
|
||||
snps = (snpstr==ref[:,:,None])
|
||||
status=write_status('encoding snps...', 76, status)
|
||||
snps = (snps*np.array([1,-1])[None,None,:])
|
||||
status=write_status('encoding snps...', 78, status)
|
||||
snps = snps.sum(-1)
|
||||
status=write_status('encoding snps', 81, status)
|
||||
snps = snps.astype('S1')
|
||||
status=write_status('marking nan values...', 88, status)
|
||||
# put in nan values (masked as -128):
|
||||
snps[inan] = -128
|
||||
status=write_status('setting up meta...', 94, status)
|
||||
# get meta information:
|
||||
metaheader = np.r_[['family_id', 'iid', 'paternal_id', 'maternal_id', 'sex', 'phenotype']]
|
||||
metadf = DataFrame(columns=metaheader, data=snpstrnp[:,:6])
|
||||
metadf.set_index('iid', inplace=1)
|
||||
metadf = metadf.join(infodf.population)
|
||||
metadf.to_pickle(preprocessed_data_paths[1])
|
||||
# put everything together:
|
||||
status=write_status('setting up snps...', 96, status)
|
||||
snpsdf = DataFrame(index=metadf.index, data=snps, columns=mapnp[:,1])
|
||||
snpsdf.to_pickle(preprocessed_data_paths[0])
|
||||
status=write_status('setting up snps...', 98, status)
|
||||
inandf = DataFrame(index=metadf.index, data=inan, columns=mapnp[:,1])
|
||||
inandf.to_pickle(preprocessed_data_paths[2])
|
||||
status=write_status('done :)', 100, status)
|
||||
print ''
|
||||
else:
|
||||
print "loading snps..."
|
||||
snpsdf = read_pickle(preprocessed_data_paths[0])
|
||||
print "loading metainfo..."
|
||||
metadf = read_pickle(preprocessed_data_paths[1])
|
||||
print "loading nan entries..."
|
||||
inandf = read_pickle(preprocessed_data_paths[2])
|
||||
snps = snpsdf.values
|
||||
populations = metadf.population.values.astype('S3')
|
||||
hapmap = dict(name=data_set,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue