Mirror of https://github.com/SheffieldML/GPy.git (synced 2026-05-01 07:46:22 +02:00)
[datasets] merged hapmap dataset into params
parent 5fb9ce9c53
commit c17791a12c
4 changed files with 227 additions and 423 deletions
@@ -106,9 +106,30 @@ def download_url(url, store_directory, save_name = None, messages = True, suffix
        raise ValueError('Tried url ' + url + suffix + ' and received client error ' + str(response.code))
    elif response.code > 499:
        raise ValueError('Tried url ' + url + suffix + ' and received server error ' + str(response.code))
    # if we wanted to get more sophisticated maybe we should check the response code here again even for successes.
    with open(save_name, 'wb') as f:
        f.write(response.read())
        meta = response.info()
        file_size = int(meta.getheaders("Content-Length")[0])
        status = ""
        file_size_dl = 0
        block_sz = 8192
        line_length = 30
        while True:
            buff = response.read(block_sz)
            if not buff:
                break
            file_size_dl += len(buff)
            f.write(buff)
            sys.stdout.write(" "*(len(status)) + "\r")
            status = r"[{perc: <{ll}}] {dl:7.3f}/{full:.3f}MB".format(dl=file_size_dl/(1.*1e6),
                                                                      full=file_size/(1.*1e6), ll=line_length,
                                                                      perc="="*int(line_length*float(file_size_dl)/file_size))
            sys.stdout.write(status)
            sys.stdout.flush()
        sys.stdout.write(" "*(len(status)) + "\r")
        print status
    # if we wanted to get more sophisticated maybe we should check the response code here again even for successes.
    #with open(save_name, 'wb') as f:
    #    f.write(response.read())

    #urllib.urlretrieve(url+suffix, save_name, reporthook)
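Note: the hunk above streams the response in fixed-size blocks and redraws a text progress bar after each block (the old one-shot response.read() body is left commented out at the end). A minimal standalone sketch of the same pattern follows; fetch_with_progress, example_url and target are illustrative names, not part of GPy, and Python 2 urllib2 is assumed to match the module's style.

    # Sketch only: chunked download with an in-place progress bar (Python 2).
    import sys
    import urllib2

    def fetch_with_progress(example_url, target, block_sz=8192, line_length=30):
        response = urllib2.urlopen(example_url)
        # Content-Length may be missing for chunked responses; a real helper would fall back gracefully.
        file_size = int(response.info().getheaders("Content-Length")[0])
        downloaded, status = 0, ""
        with open(target, 'wb') as f:
            while True:
                buff = response.read(block_sz)
                if not buff:
                    break
                downloaded += len(buff)
                f.write(buff)
                sys.stdout.write(" " * len(status) + "\r")   # clear the previous status line
                status = "[{perc: <{ll}}] {dl:7.3f}/{full:.3f}MB".format(
                    dl=downloaded / 1e6, full=file_size / 1e6, ll=line_length,
                    perc="=" * int(line_length * float(downloaded) / file_size))
                sys.stdout.write(status)
                sys.stdout.flush()
        sys.stdout.write("\n")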
@@ -552,6 +573,151 @@ def swiss_roll_generated(num_samples=1000, sigma=0.0):
    c = c[so, :]
    return {'Y':Y, 't':t, 'colors':c}

def hapmap3(data_set='hapmap3'):
    """
    The HapMap phase three SNP dataset - 1184 samples out of 11 populations.

    SNP_matrix (A) encoding [see Paschou et al. 2007 (PCA-Correlated SNPs...)]:
    Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then

            / 1, iff SNPij==(B1,B1)
      Aij = | 0, iff SNPij==(B1,B2)
            \ -1, iff SNPij==(B2,B2)

    The SNP data and the meta information (such as iid, sex and phenotype) are
    stored in the dataframe datadf, index is the Individual ID,
    with the following columns for metainfo:

        * family_id -> Family ID
        * paternal_id -> Paternal ID
        * maternal_id -> Maternal ID
        * sex -> Sex (1=male; 2=female; other=unknown)
        * phenotype -> Phenotype (-9, or 0 for unknown)
        * population -> Population string (e.g. 'ASW' - 'YRI')
        * rest are SNP rs (ids)

    More information is given in infodf:

        * Chromosome:
            - autosomal chromosomes -> 1-22
            - X    X chromosome -> 23
            - Y    Y chromosome -> 24
            - XY   Pseudo-autosomal region of X -> 25
            - MT   Mitochondrial -> 26
        * Relative Position (to Chromosome) [base pairs]
    """
    try:
        from pandas import read_pickle, DataFrame
        from sys import stdout
        import bz2
    except ImportError:
        raise ImportError("Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset")
    if not data_available(data_set):
        download_data(data_set)
    dirpath = os.path.join(data_path, 'hapmap3')
    hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly'
    preprocessed_data_paths = [os.path.join(dirpath, hapmap_file_name + file_name) for file_name in
                               ['.snps.pickle',
                                '.info.pickle',
                                '.nan.pickle']]
    if not reduce(lambda a, b: a and b, map(os.path.exists, preprocessed_data_paths)):
        if not overide_manual_authorize and not prompt_user("Preprocessing requires ~25GB "
                                                            "of memory and can take a (very) long time, continue? [Y/n]"):
            print "Preprocessing required for further usage."
            return
        status = "Preprocessing data, please be patient..."
        print status
        def write_status(message, progress, status):
            stdout.write(" "*len(status)); stdout.write("\r"); stdout.flush()
            status = r"[{perc: <{ll}}] {message: <13s}".format(message=message, ll=20,
                                                               perc="="*int(20.*progress/100.))
            stdout.write(status); stdout.flush()
            return status
        unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']]
        if not reduce(lambda a, b: a and b, map(os.path.exists, unpacked_files)):
            status = write_status('unpacking...', 0, '')
            curr = 0
            for newfilepath in unpacked_files:
                if not os.path.exists(newfilepath):
                    filepath = newfilepath + '.bz2'
                    file_size = os.path.getsize(filepath)
                    with open(newfilepath, 'wb') as new_file, open(filepath, 'rb') as f:
                        decomp = bz2.BZ2Decompressor()
                        file_processed = 0
                        buffsize = 100 * 1024
                        for data in iter(lambda: f.read(buffsize), b''):
                            new_file.write(decomp.decompress(data))
                            file_processed += len(data)
                            status = write_status('unpacking...', curr+12.*file_processed/(file_size), status)
                curr += 12
                status = write_status('unpacking...', curr, status)
        status = write_status('reading .ped...', 25, status)
        # Preprocess data:
        snpstrnp = np.loadtxt(unpacked_files[0], dtype=str)
        status = write_status('reading .map...', 33, status)
        mapnp = np.loadtxt(unpacked_files[1], dtype=str)
        status = write_status('reading relationships.txt...', 42, status)
        # and metainfo:
        infodf = DataFrame.from_csv(os.path.join(dirpath, './relationships_w_pops_121708.txt'), header=0, sep='\t')
        infodf.set_index('IID', inplace=1)
        status = write_status('filtering nan...', 45, status)
        snpstr = snpstrnp[:, 6:].astype('S1').reshape(snpstrnp.shape[0], -1, 2)
        inan = snpstr[:, :, 0] == '0'
        status = write_status('filtering reference alleles...', 55, status)
        ref = np.array(map(lambda x: np.unique(x)[-2:], snpstr.swapaxes(0, 1)[:, :, :]))
        status = write_status('encoding snps...', 70, status)
        # Encode the information for each gene in {-1,0,1}:
        status = write_status('encoding snps...', 73, status)
        snps = (snpstr == ref[None, :, :])
        status = write_status('encoding snps...', 76, status)
        snps = (snps * np.array([1, -1])[None, None, :])
        status = write_status('encoding snps...', 78, status)
        snps = snps.sum(-1)
        status = write_status('encoding snps...', 81, status)
        snps = snps.astype('i8')
        status = write_status('marking nan values...', 88, status)
        # put in nan values (masked as -128):
        snps[inan] = -128
        status = write_status('setting up meta...', 94, status)
        # get meta information:
        metaheader = np.r_[['family_id', 'iid', 'paternal_id', 'maternal_id', 'sex', 'phenotype']]
        metadf = DataFrame(columns=metaheader, data=snpstrnp[:, :6])
        metadf.set_index('iid', inplace=1)
        metadf = metadf.join(infodf.population)
        metadf.to_pickle(preprocessed_data_paths[1])
        # put everything together:
        status = write_status('setting up snps...', 96, status)
        snpsdf = DataFrame(index=metadf.index, data=snps, columns=mapnp[:, 1])
        with open(preprocessed_data_paths[0], 'wb') as f:
            pickle.dump(snpsdf, f, protocol=-1)
        status = write_status('setting up snps...', 98, status)
        inandf = DataFrame(index=metadf.index, data=inan, columns=mapnp[:, 1])
        inandf.to_pickle(preprocessed_data_paths[2])
        status = write_status('done :)', 100, status)
        print ''
    else:
        print "loading snps..."
        snpsdf = read_pickle(preprocessed_data_paths[0])
        print "loading metainfo..."
        metadf = read_pickle(preprocessed_data_paths[1])
        print "loading nan entries..."
        inandf = read_pickle(preprocessed_data_paths[2])
    snps = snpsdf.values
    populations = metadf.population.values.astype('S3')
    hapmap = dict(name=data_set,
                  description='The HapMap phase three SNP dataset - '
                              '1184 samples out of 11 populations. inan is a '
                              'boolean array, containing whether or not the '
                              'given entry is nan (nans are masked as '
                              '-128 in snps).',
                  snpsdf=snpsdf,
                  metadf=metadf,
                  snps=snps,
                  inan=inandf.values,
                  inandf=inandf,
                  populations=populations)
    return hapmap

def swiss_roll_1000():
    return swiss_roll(num_samples=1000)
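Note: the {-1, 0, 1} encoding described in the hapmap3 docstring can be checked on toy data. The sketch below mirrors the encoding steps used in the function above (reference alleles via np.unique, elementwise comparison against the reference pair, missing calls masked as -128); snpstr_toy and its values are made up for illustration.

    import numpy as np

    # three individuals, two SNPs, two alleles each; '0' marks a missing call
    snpstr_toy = np.array([[['A', 'A'], ['G', 'T']],
                           [['A', 'C'], ['T', 'T']],
                           [['0', '0'], ['G', 'T']]])
    inan = snpstr_toy[:, :, 0] == '0'
    # per SNP, the two alphabetically largest symbols are the bases (B1, B2)
    ref = np.array([np.unique(snp)[-2:] for snp in snpstr_toy.swapaxes(0, 1)])
    snps = (snpstr_toy == ref[None, :, :]) * np.array([1, -1])[None, None, :]
    snps = snps.sum(-1).astype('i8')   # (B1,B1) -> 1, (B1,B2) -> 0, (B2,B2) -> -1
    snps[inan] = -128                  # missing genotypes are masked as -128
    # snps is now [[1, 0], [0, -1], [-128, 0]]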
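Once merged, the new loader should be callable like the other GPy dataset helpers. A hedged usage sketch, assuming the module is exposed as GPy.util.datasets and that the large download and one-off preprocessing have already been authorized:

    import GPy
    data = GPy.util.datasets.hapmap3()
    snps = data['snps']                  # (num_samples, num_snps) array; -128 marks missing entries
    populations = data['populations']    # one population code per individual, e.g. 'YRI'
    print snps.shape, len(populations)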