[datasets] merged hapmap dataset into params

2026-07-11 16:22:13 +02:00 · 2014-04-16 15:37:50 +01:00 · 2014-04-16 15:37:50 +01:00 · c17791a12c
commit c17791a12c
parent 5fb9ce9c53
4 changed files with 227 additions and 423 deletions
--- a/GPy/inference/latent_function_inference/var_dtc.py
+++ b/GPy/inference/latent_function_inference/var_dtc.py
@ -192,17 +192,22 @@ class VarDTC(object):

 class VarDTCMissingData(object):
    const_jitter = 1e-6
-    def __init__(self, limit=1):
+    def __init__(self, limit=1, inan=None):
        from ...util.caching import Cacher
        self._Y = Cacher(self._subarray_computations, limit)
+        self._inan = inan
        pass

    def set_limit(self, limit):
        self._Y.limit = limit

    def _subarray_computations(self, Y):
-        inan = np.isnan(Y)
-        has_none = inan.any()
+        if self._inan is None:
+            inan = np.isnan(Y)
+            has_none = inan.any()
+        else:
+            inan = self._inan
+            has_none = True
        if has_none:
            from ...util.subarray_and_sorting import common_subarrays
            self._subarray_indices = []
--- a/GPy/util/data_resources.json
+++ b/GPy/util/data_resources.json
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@ -106,9 +106,30 @@ def download_url(url, store_directory, save_name = None, messages = True, suffix
            raise ValueError('Tried url ' + url + suffix + ' and received client error ' + str(response.code))
        elif response.code > 499:
            raise ValueError('Tried url ' + url + suffix + ' and received server error ' + str(response.code))
-    # if we wanted to get more sophisticated maybe we should check the response code here again even for successes.
    with open(save_name, 'wb') as f:
-        f.write(response.read())
+        meta = response.info()
+        file_size = int(meta.getheaders("Content-Length")[0])
+        status = ""
+        file_size_dl = 0
+        block_sz = 8192
+        line_length=30
+        while True:
+            buff = response.read(block_sz)
+            if not buff:
+                break
+            file_size_dl += len(buff)
+            f.write(buff)
+            sys.stdout.write(" "*(len(status)) + "\r")
+            status = r"[{perc: <{ll}}] {dl:7.3f}/{full:.3f}MB".format(dl=file_size_dl/(1.*1e6), 
+                                                                       full=file_size/(1.*1e6), ll=line_length, 
+                                                                       perc="="*int(line_length*float(file_size_dl)/file_size))
+            sys.stdout.write(status)
+            sys.stdout.flush()
+        sys.stdout.write(" "*(len(status)) + "\r")
+        print status
+    # if we wanted to get more sophisticated maybe we should check the response code here again even for successes.
+    #with open(save_name, 'wb') as f:
+    #    f.write(response.read())

    #urllib.urlretrieve(url+suffix, save_name, reporthook)

@ -552,6 +573,151 @@ def swiss_roll_generated(num_samples=1000, sigma=0.0):
    c = c[so, :]
    return {'Y':Y, 't':t, 'colors':c}

+def hapmap3(data_set='hapmap3'):
+    """
+    The HapMap phase three SNP dataset - 1184 samples out of 11 populations.
+
+    SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]:
+    Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then
+
+          /  1, iff SNPij==(B1,B1)
+    Aij = |  0, iff SNPij==(B1,B2)
+          \ -1, iff SNPij==(B2,B2)
+
+    The SNP data and the meta information (such as iid, sex and phenotype) are
+    stored in the dataframe datadf, index is the Individual ID, 
+    with following columns for metainfo:
+
+        * family_id   -> Family ID
+        * paternal_id -> Paternal ID
+        * maternal_id -> Maternal ID
+        * sex         -> Sex (1=male; 2=female; other=unknown)
+        * phenotype   -> Phenotype (-9, or 0 for unknown)
+        * population  -> Population string (e.g. 'ASW' - 'YRI')
+        * rest are SNP rs (ids)
+
+    More information is given in infodf:
+
+        * Chromosome:
+            - autosomal chromosemes                -> 1-22
+            - X    X chromosome                    -> 23
+            - Y    Y chromosome                    -> 24
+            - XY   Pseudo-autosomal region of X    -> 25
+            - MT   Mitochondrial                   -> 26
+        * Relative Positon (to Chromosome) [base pairs]
+    """
+    try:
+        from pandas import read_pickle, DataFrame
+        from sys import stdout
+        import bz2
+    except ImportError as i:
+        raise i, "Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset"
+    if not data_available(data_set):
+        download_data(data_set)
+    dirpath = os.path.join(data_path,'hapmap3')
+    hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly'
+    preprocessed_data_paths = [os.path.join(dirpath,hapmap_file_name + file_name) for file_name in \
+                               ['.snps.pickle',
+                                '.info.pickle',
+                                '.nan.pickle']]
+    if not reduce(lambda a,b: a and b, map(os.path.exists, preprocessed_data_paths)):
+        if not overide_manual_authorize and not prompt_user("Preprocessing requires ~25GB "
+                            "of memory and can take a (very) long time, continue? [Y/n]"):
+            print "Preprocessing required for further usage."
+            return
+        status = "Preprocessing data, please be patient..."
+        print status
+        def write_status(message, progress, status):
+            stdout.write(" "*len(status)); stdout.write("\r"); stdout.flush()
+            status = r"[{perc: <{ll}}] {message: <13s}".format(message=message, ll=20,
+                                                               perc="="*int(20.*progress/100.))
+            stdout.write(status); stdout.flush()
+            return status
+        unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']]
+        if not reduce(lambda a,b: a and b, map(os.path.exists, unpacked_files)):
+            status=write_status('unpacking...', 0, '')
+            curr = 0
+            for newfilepath in unpacked_files:
+                if not os.path.exists(newfilepath):
+                    filepath = newfilepath + '.bz2'
+                    file_size = os.path.getsize(filepath)
+                    with open(newfilepath, 'wb') as new_file, open(filepath, 'rb') as f:
+                        decomp = bz2.BZ2Decompressor()
+                        file_processed = 0
+                        buffsize = 100 * 1024
+                        for data in iter(lambda : f.read(buffsize), b''):
+                            new_file.write(decomp.decompress(data))
+                            file_processed += len(data)
+                            status=write_status('unpacking...', curr+12.*file_processed/(file_size), status)
+                curr += 12
+                status=write_status('unpacking...', curr, status)
+        status=write_status('reading .ped...', 25, status)
+        # Preprocess data:    
+        snpstrnp = np.loadtxt(unpacked_files[0], dtype=str)
+        status=write_status('reading .map...', 33, status)
+        mapnp = np.loadtxt(unpacked_files[1], dtype=str)
+        status=write_status('reading relationships.txt...', 42, status)
+        # and metainfo:
+        infodf = DataFrame.from_csv(os.path.join(dirpath,'./relationships_w_pops_121708.txt'), header=0, sep='\t')
+        infodf.set_index('IID', inplace=1)
+        status=write_status('filtering nan...', 45, status)
+        snpstr = snpstrnp[:,6:].astype('S1').reshape(snpstrnp.shape[0], -1, 2)
+        inan = snpstr[:,:,0] == '0'
+        status=write_status('filtering reference alleles...', 55, status)
+        ref = np.array(map(lambda x: np.unique(x)[-2:], snpstr.swapaxes(0,1)[:,:,:]))
+        status=write_status('encoding snps...', 70, status)
+        # Encode the information for each gene in {-1,0,1}:
+        status=write_status('encoding snps...', 73, status)
+        snps = (snpstr==ref[None,:,:])
+        status=write_status('encoding snps...', 76, status)
+        snps = (snps*np.array([1,-1])[None,None,:])
+        status=write_status('encoding snps...', 78, status)
+        snps = snps.sum(-1)
+        status=write_status('encoding snps...', 81, status)
+        snps = snps.astype('i8')
+        status=write_status('marking nan values...', 88, status)
+        # put in nan values (masked as -128):
+        snps[inan] = -128
+        status=write_status('setting up meta...', 94, status)
+        # get meta information:
+        metaheader = np.r_[['family_id', 'iid', 'paternal_id', 'maternal_id', 'sex', 'phenotype']]
+        metadf = DataFrame(columns=metaheader, data=snpstrnp[:,:6])
+        metadf.set_index('iid', inplace=1)
+        metadf = metadf.join(infodf.population)
+        metadf.to_pickle(preprocessed_data_paths[1])
+        # put everything together:
+        status=write_status('setting up snps...', 96, status)
+        snpsdf = DataFrame(index=metadf.index, data=snps, columns=mapnp[:,1])
+        with open(preprocessed_data_paths[0], 'wb') as f:
+            pickle.dump(f, snpsdf, protocoll=-1)
+        status=write_status('setting up snps...', 98, status)
+        inandf = DataFrame(index=metadf.index, data=inan, columns=mapnp[:,1])
+        inandf.to_pickle(preprocessed_data_paths[2])
+        status=write_status('done :)', 100, status)
+        print ''
+    else:
+        print "loading snps..."
+        snpsdf = read_pickle(preprocessed_data_paths[0])
+        print "loading metainfo..."
+        metadf = read_pickle(preprocessed_data_paths[1])
+        print "loading nan entries..."
+        inandf = read_pickle(preprocessed_data_paths[2])
+    snps = snpsdf.values
+    populations = metadf.population.values.astype('S3')
+    hapmap = dict(name=data_set,
+                  description='The HapMap phase three SNP dataset - '
+                  '1184 samples out of 11 populations. inan is a '
+                  'boolean array, containing wheather or not the '
+                  'given entry is nan (nans are masked as '
+                  '-128 in snps).',
+                  snpsdf=snpsdf,
+                  metadf=metadf,
+                  snps=snps,
+                  inan=inandf.values,
+                  inandf=inandf,
+                  populations=populations)
+    return hapmap
+    
 def swiss_roll_1000():
    return swiss_roll(num_samples=1000)

--- a/GPy/util/datasets/data_resources_create.py
+++ b/GPy/util/datasets/data_resources_create.py
@ -24,12 +24,12 @@ data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
                                     'license': None,
                                     'size' : 1100584},
                  'cmu_mocap_full' : {'urls' : ['http://mocap.cs.cmu.edu'],
-                                 'files' : [['allasfamc.zip']],
-                                 'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.
-The database was created with funding from NSF EIA-0196217.""",
-                                 'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""",
-                                 'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""",
-                                 'size' : None},
+                                      'files' : [['allasfamc.zip']],
+                                      'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.'
+                                      'The database was created with funding from NSF EIA-0196217.""",
+                                      'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""",
+                                      'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""",
+                                      'size' : None},
                  'creep_rupture' : {'urls' : ['http://www.msm.cam.ac.uk/map/data/tar/'],
                                     'files' : [['creeprupt.tar']],
                                     'citation' : 'Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.',
@ -120,8 +120,49 @@ The database was created with funding from NSF EIA-0196217.""",
                                        'details' : """Accelerometer pen data used for robust regression by Tipping and Lawrence.""",
                                        'citation' : 'Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005',
                                        'license' : None,
-                                        'size' : 3410}
+                                        'size' : 3410},
+                  'hapmap3' : {'urls' : ['http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/'],
+                                 'files' : [['hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2', 'hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2', 'relationships_w_pops_121708.txt']],
+                                 'details' : """
+        HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. 
+        The HapMap phase three SNP dataset - 1184 samples out of 11 populations.
+        See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details.
+
+        SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]:
+        Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then
+
+              /  1, iff SNPij==(B1,B1)
+        Aij = |  0, iff SNPij==(B1,B2)
+              \ -1, iff SNPij==(B2,B2)
+
+        The SNP data and the meta information (such as iid, sex and phenotype) are
+        stored in the dataframe datadf, index is the Individual ID, 
+        with following columns for metainfo:
+
+            * family_id   -> Family ID
+            * paternal_id -> Paternal ID
+            * maternal_id -> Maternal ID
+            * sex         -> Sex (1=male; 2=female; other=unknown)
+            * phenotype   -> Phenotype (-9, or 0 for unknown)
+            * population  -> Population string (e.g. 'ASW' - 'YRI')
+            * rest are SNP rs (ids)
+
+        More information is given in infodf:
+
+            * Chromosome:
+                - autosomal chromosemes                -> 1-22
+                - X    X chromosome                    -> 23
+                - Y    Y chromosome                    -> 24
+                - XY   Pseudo-autosomal region of X    -> 25
+                - MT   Mitochondrial                   -> 26
+            * Relative Positon (to Chromosome) [base pairs]
+
+        """,
+                                 'citation': """Gibbs, Richard A., et al. "The international HapMap project." Nature 426.6968 (2003): 789-796.""",
+                                 'license' : """International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)""",
+                                 'size' : 2*1729092237 + 62265},
                  }

-with open('data_resources.json', 'w') as file:
-    json.dump(data_resources, file)
+with open('data_resources.json', 'w') as f:
+    print "writing data_resources"
+    json.dump(data_resources, f)