diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json index 5f07a0e9..b49ef498 100644 --- a/GPy/util/data_resources.json +++ b/GPy/util/data_resources.json @@ -123,11 +123,39 @@ ] ], "license": null, - "size": 0, + "size": 20258, "urls": [ "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/drosophila_protein/" ] }, + "spellman_yeast": { + "citation": "Paul T. Spellman, Gavin Sherlock, Michael Q. Zhang, Vishwanath R. Iyer, Kirk Anders, Michael B. Eisen, Patrick O. Brown, David Botstein, and Bruce Futcher 'Comprehensive Identification of Cell Cycle-regulated Genes of the Yeast Saccharomyces cerevisiae by Microarray Hybridization.' Molecular Biology of the Cell 9, 3273-3297", + "details": "Two colour spotted cDNA array data set of a series of experiments to identify which genes in Yeast are cell cycle regulated.", + "files": [ + [ + "combined.txt" + ] + ], + "license": null, + "size": 2510955, + "urls": [ + "http://genome-www.stanford.edu/cellcycle/data/rawdata/" + ] + }, + "lee_yeast_ChIP": { + "citation": "Tong Ihn Lee, Nicola J. Rinaldi, Francois Robert, Duncan T. Odom, Ziv Bar-Joseph, Georg K. Gerber, Nancy M. Hannett, Christopher T. Harbison, Craig M. Thompson, Itamar Simon, Julia Zeitlinger, Ezra G. Jennings, Heather L. Murray, D. Benjamin Gordon, Bing Ren, John J. Wyrick, Jean-Bosco Tagne, Thomas L. Volkert, Ernest Fraenkel, David K. Gifford, Richard A. Young 'Transcriptional Regulatory Networks in Saccharomyces cerevisiae' Science 298 (5594) pg 799--804. DOI: 10.1126/science.1075090", + "details": "Binding location analysis for 106 regulators in yeast. 
The data consists of p-values for binding of regulators to genes derived from ChIP-chip experiments.",
+        "files": [
+            [
+                "binding_by_gene.tsv"
+            ]
+        ],
+        "license": null,
+        "size": 1674161,
+        "urls": [
+            "http://jura.wi.mit.edu/young_public/regulatory_network/"
+        ]
+    },
     "epomeo_gpx": {
         "citation": "",
         "details": "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).",
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index 4d89ece2..06532438 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -375,7 +375,74 @@ def sod1_mouse(data_set='sod1_mouse'):
     num_cond=4
     X = 1
     return data_details_return({'X': X, 'Y': Y}, data_set)
+
+def _read_spellman_yeast(data_set):
+    """Load 'combined.txt' of the Spellman yeast data with pandas.
+
+    Calls ``download_data`` first when ``data_available`` is false, then reads
+    the tab-separated file with its first row as header and first column as index.
+
+    :param data_set: name of the data set directory and resource entry.
+    :returns: the table returned by ``pandas.read_csv``.
+    """
+    if not data_available(data_set):
+        download_data(data_set)
+    from pandas import read_csv
+    filename = os.path.join(data_path, data_set, 'combined.txt')
+    return read_csv(filename, header=0, index_col=0, sep='\t')
+
+def spellman_yeast(data_set='spellman_yeast'):
+    """Load the complete expression table of the Spellman et al. yeast data.
+
+    :param data_set: name of the data set directory and resource entry.
+    :returns: result of ``data_details_return`` for a dictionary holding the
+        table under the key 'Y'.
+    """
+    Y = _read_spellman_yeast(data_set)
+    return data_details_return({'Y': Y}, data_set)
+
+def spellman_yeast_cdc(data_set='spellman_yeast'):
+    """Load the cdc15 time series from the Spellman et al. yeast data.
+
+    Picks the columns named 'cdc15_<time>' for the time points listed in the
+    body and transposes the selection, so rows of 'Y' correspond to times.
+
+    :param data_set: name of the data set directory and resource entry.
+    :returns: result of ``data_details_return`` for a dictionary with 'Y'
+        (selected columns, transposed), 't' (time points as a column vector)
+        and 'info' (a short description string).
+    """
+    Y = _read_spellman_yeast(data_set)
+    t = np.asarray([10, 30, 50, 70, 80, 90, 100, 110, 120, 130, 140, 150, 170, 180, 190, 200, 210, 220, 230, 240, 250, 270, 290])
+    times = ['cdc15_' + str(time) for time in t]
+    Y = Y[times].T
+    t = t[:, None]
+    return data_details_return({'Y': Y, 't': t, 'info': 'Time series of synchronized yeast cells from the CDC-15 experiment of Spellman et al (1998).'}, data_set)
+
+def lee_yeast_ChIP(data_set='lee_yeast_ChIP'):
+    """Load the binding table of the Lee et al. yeast ChIP data.
+
+    Reads the tab-separated file 'binding_by_gene.tsv', taking column names
+    from its second row. Columns named 'Unnamed: 1' to 'Unnamed: 3' are
+    returned as annotations; every column whose name does not start with
+    'Unnamed' is treated as a transcription factor.
+
+    :param data_set: name of the data set directory and resource entry.
+    :returns: result of ``data_details_return`` for a dictionary with
+        'annotations', 'X' (the transcription factor columns) and
+        'transcription_factors' (their column names).
+    """
+    if not data_available(data_set):
+        download_data(data_set)
+    from pandas import read_csv
+    filename = os.path.join(data_path, data_set, 'binding_by_gene.tsv')
+    X = read_csv(filename, header=1, index_col=0, sep='\t')
+    transcription_factors = [col for col in X.columns if not col.startswith('Unnamed')]
+    annotations = X[['Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3']]
+    X = X[transcription_factors]
+    return data_details_return({'annotations': annotations, 'X': X, 'transcription_factors': transcription_factors}, data_set)
+

 def fruitfly_tomancak(data_set='fruitfly_tomancak', gene_number=None):
     if not data_available(data_set):
         download_data(data_set)