added hapmap download, need to put in data preprocessing for actual usability

This commit is contained in:
Max Zwiessele 2014-01-31 16:56:11 +00:00
parent 8ccc3e071e
commit 54a9ff2a06
3 changed files with 52 additions and 343 deletions

File diff suppressed because one or more lines are too long

View file

@ -37,7 +37,6 @@ if not (on_rtd):
json_data=open(path).read()
data_resources = json.loads(json_data)
def prompt_user(prompt):
"""Ask user for agreeing to data set licenses."""
# raw_input returns the empty string for "enter"
@ -94,9 +93,30 @@ def download_url(url, store_directory, save_name = None, messages = True, suffix
raise ValueError('Tried url ' + url + suffix + ' and received client error ' + str(response.code))
elif response.code > 499:
raise ValueError('Tried url ' + url + suffix + ' and received server error ' + str(response.code))
# if we wanted to get more sophisticated maybe we should check the response code here again even for successes.
with open(save_name, 'wb') as f:
f.write(response.read())
meta = response.info()
file_size = int(meta.getheaders("Content-Length")[0])
status = ""
file_size_dl = 0
block_sz = 8192
line_length=30
while True:
buff = response.read(block_sz)
if not buff:
break
file_size_dl += len(buff)
f.write(buff)
sys.stdout.write(" "*(len(status)) + "\r")
status = r"[{perc: <{ll}}] {dl:7.3f}/{full:.3f}MB".format(dl=file_size_dl/(1.*1e6),
full=file_size/(1.*1e6), ll=line_length,
perc="="*int(line_length*float(file_size_dl)/file_size))
sys.stdout.write(status)
sys.stdout.flush()
sys.stdout.write(" "*(len(status)) + "\r")
print status
# if we wanted to get more sophisticated maybe we should check the response code here again even for successes.
#with open(save_name, 'wb') as f:
# f.write(response.read())
#urllib.urlretrieve(url+suffix, save_name, reporthook)
@ -431,28 +451,29 @@ def swiss_roll_generated(num_samples=1000, sigma=0.0):
c = c[so, :]
return {'Y':Y, 't':t, 'colors':c}
def hapmapIII(data_set='hapmapIII'):
def hapmap3(data_set='hapmap3'):
try:
from pandas import read_pickle
except ImportError as i:
raise i, "Need pandas for hapmap dataset, make sure to install pandas before loading the hapmap dataset"
if not data_available(data_set):
download_data(data_set)
datadf = read_pickle(os.path.join(data_path,'HapMapIII','hapmap3_r2_b36_fwd.consensus.qc.poly.snps.pickle'))
infodf = read_pickle(os.path.join(data_path,'HapMapIII','hapmap3_r2_b36_fwd.consensus.qc.poly.info.pickle'))
inan = read_pickle(os.path.join(data_path,'HapMapIII','hapmap3_r2_b36_fwd.consensus.qc.poly.nan.pickle'))
snps = datadf.iloc[:,6:].values
populations = datadf.population.values.astype('S3')
hapmap = dict(name='HapMapIII',
describtion='The HapMap phase three SNP dataset - '
snpsdf = read_pickle(os.path.join(data_path,'HapMap3','hapmap3_r2_b36_fwd.consensus.qc.poly.snps.pickle'))
metadf = read_pickle(os.path.join(data_path,'HapMap3','hapmap3_r2_b36_fwd.consensus.qc.poly.info.pickle'))
inandf = read_pickle(os.path.join(data_path,'HapMap3','hapmap3_r2_b36_fwd.consensus.qc.poly.nan.pickle'))
snps = snpsdf.values
populations = metadf.population.values.astype('S3')
hapmap = dict(name=data_set,
description='The HapMap phase three SNP dataset - '
'1184 samples out of 11 populations. inan is a '
'boolean array, containing wheather or not the '
'given entry is nan (nans are masked as '
'-128 in snps).',
datadf=datadf,
infodf=infodf,
snpsdf=snpsdf,
metadf=metadf,
snps=snps,
inan=inan,
inan=inandf.values,
inandf=inandf,
populations=populations)
return hapmap

View file

@ -24,12 +24,12 @@ data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
'license': None,
'size' : 1100584},
'cmu_mocap_full' : {'urls' : ['http://mocap.cs.cmu.edu'],
'files' : [['allasfamc.zip']],
'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.
The database was created with funding from NSF EIA-0196217.""",
'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""",
'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""",
'size' : None},
'files' : [['allasfamc.zip']],
'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.'
'The database was created with funding from NSF EIA-0196217.""",
'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""",
'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""",
'size' : None},
'creep_rupture' : {'urls' : ['http://www.msm.cam.ac.uk/map/data/tar/'],
'files' : [['creeprupt.tar']],
'citation' : 'Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.',
@ -120,8 +120,15 @@ The database was created with funding from NSF EIA-0196217.""",
'details' : """Accelerometer pen data used for robust regression by Tipping and Lawrence.""",
'citation' : 'Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005',
'license' : None,
'size' : 3410}
'size' : 3410},
'hapmap3' : {'urls' : ['http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/'],
'files' : [['hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2', 'hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2', 'relationships_w_pops_121708.txt']],
'details' : """HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details.""",
'citation': """Gibbs, Richard A., et al. "The international HapMap project." Nature 426.6968 (2003): 789-796.""",
'license' : """International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)""",
'size' : 2*1729092237 + 62265},
}
with open('data_resources.json', 'w') as file:
json.dump(data_resources, file)
with open('data_resources.json', 'w') as f:
print "writing data_resources"
json.dump(data_resources, f)