From 1c1c6008a96a209612e33e9b05cc3b1db97fd7ff Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Mon, 12 May 2014 09:12:24 +0100 Subject: [PATCH 1/2] [data] data_resources edited, such that json file is edited directly --- GPy/util/data_resources.json | 441 ++++++++++++++++++++++++++++++++++- 1 file changed, 440 insertions(+), 1 deletion(-) diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json index a4a82edd..8f5f06bc 100644 --- a/GPy/util/data_resources.json +++ b/GPy/util/data_resources.json @@ -1 +1,440 @@ -{"rogers_girolami_data": {"files": [["firstcoursemldata.tar.gz"]], "license": null, "citation": "A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146", "details": "Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.", "urls": ["https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/"], "suffices": [["?dl=1"]], "size": 21949154}, "ankur_pose_data": {"files": [["ankurDataPoseSilhouette.mat"]], "citation": "3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.", "license": null, "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/ankur_pose_data/"], "details": "Artificially generated data of silhouettes given poses. 
Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."}, "osu_accad": {"files": [["swagger1TXT.ZIP", "handspring1TXT.ZIP", "quickwalkTXT.ZIP", "run1TXT.ZIP", "sprintTXT.ZIP", "dogwalkTXT.ZIP", "camper_04TXT.ZIP", "dance_KB3_TXT.ZIP", "per20_TXT.ZIP", "perTWO07_TXT.ZIP", "perTWO13_TXT.ZIP", "perTWO14_TXT.ZIP", "perTWO15_TXT.ZIP", "perTWO16_TXT.ZIP"], ["connections.txt"]], "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", "details": "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.", "urls": ["http://accad.osu.edu/research/mocap/data/", "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/"], "size": 15922790}, "isomap_face_data": {"files": [["face_data.mat"]], "license": null, "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", "details": "Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/isomap_face_data/"], "size": 24229368}, "boston_housing": {"files": [["Index", "housing.data", "housing.names"]], "license": null, "citation": "Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. 
Economics & Management, vol.5, 81-102, 1978.", "details": "The Boston Housing data relates house values in Boston to a range of input variables.", "urls": ["http://archive.ics.uci.edu/ml/machine-learning-databases/housing/"], "size": 51276}, "cmu_mocap_full": {"files": [["allasfamc.zip"]], "license": "From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.", "citation": "Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.'\n 'The database was created with funding from NSF EIA-0196217.", "details": "CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.", "urls": ["http://mocap.cs.cmu.edu"], "size": null}, "brendan_faces": {"files": [["frey_rawface.mat"]], "license": null, "citation": "Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. 
Computer Society Press, Los Alamitos, CA.", "details": "A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.", "urls": ["http://www.cs.nyu.edu/~roweis/data/"], "size": 1100584}, "singlecell": {"files": [["singlecell.csv"]], "license": "ScienceDirect: http://www.elsevier.com/locate/termsandconditions?utm_source=sciencedirect&utm_medium=link&utm_campaign=terms", "citation": "Guoji Guo, Mikael Huss, Guo Qing Tong, Chaoyang Wang, Li Li Sun, Neil D. Clarke, Paul Robson, Resolution of Cell Fate Decisions Revealed by Single-Cell Gene Expression Analysis from Zygote to Blastocyst, Developmental Cell, Volume 18, Issue 4, 20 April 2010, Pages 675-685, ISSN 1534-5807, http://dx.doi.org/10.1016/j.devcel.2010.02.012. (http://www.sciencedirect.com/science/article/pii/S1534580710001103) Keywords: DEVBIO", "details": "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", "urls": ["http://staffwww.dcs.sheffield.ac.uk/people/M.Zwiessele/data/singlecell/"], "size": 233.1}, "olympic_marathon_men": {"files": [["olympicMarathonTimes.csv"]], "license": null, "citation": null, "details": "Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olympic_marathon_men/"], "size": 584}, "pumadyn-32nm": {"files": [["pumadyn-32nm.tar.gz"]], "license": "Data is made available by the Delve system at the University of Toronto", "citation": "Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.", "details": "Pumadyn non linear 32 input data set with moderate noise. 
See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.", "urls": ["ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/"], "size": 5861646}, "ripley_prnn_data": {"files": [["Cushings.dat", "README", "crabs.dat", "fglass.dat", "fglass.grp", "pima.te", "pima.tr", "pima.tr2", "synth.te", "synth.tr", "viruses.dat", "virus3.dat"]], "license": null, "citation": "Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7", "details": "Data sets from Brian Ripley's Pattern Recognition and Neural Networks", "urls": ["http://www.stats.ox.ac.uk/pub/PRNN/"], "size": 93565}, "three_phase_oil_flow": {"files": [["DataTrnLbls.txt", "DataTrn.txt", "DataTst.txt", "DataTstLbls.txt", "DataVdn.txt", "DataVdnLbls.txt"]], "license": null, "citation": "Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593", "details": "The three phase oil data used initially for demonstrating the Generative Topographic mapping.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/three_phase_oil_flow/"], "size": 712796}, "robot_wireless": {"files": [["uw-floor.txt"]], "license": null, "citation": "WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.", "details": "Data created by Brian Ferris and Dieter Fox. 
Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/robot_wireless/"], "size": 284390}, "xw_pen": {"files": [["xw_pen_15.csv"]], "license": null, "citation": "Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005", "details": "Accelerometer pen data used for robust regression by Tipping and Lawrence.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/xw_pen/"], "size": 3410}, "swiss_roll": {"files": [["swiss_roll_data.mat"]], "license": null, "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", "details": "Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", "urls": ["http://isomap.stanford.edu/"], "size": 800256}, "osu_run1": {"files": [["run1TXT.ZIP"], ["connections.txt"]], "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", "details": "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.", "urls": ["http://accad.osu.edu/research/mocap/data/", "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/"], "size": 338103}, "creep_rupture": {"files": [["creeprupt.tar"]], "license": null, "citation": "Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. 
Yoshida.", "details": "Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.", "urls": ["http://www.msm.cam.ac.uk/map/data/tar/"], "size": 602797}, "hapmap3": {"files": [["hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2", "hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2", "relationships_w_pops_121708.txt"]], "license": "International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)", "citation": "Gibbs, Richard A., et al. \"The international HapMap project.\" Nature 426.6968 (2003): 789-796.", "details": "\n HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. \n The HapMap phase three SNP dataset - 1184 samples out of 11 populations.\n See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details.\n\n SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]:\n Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then\n\n / 1, iff SNPij==(B1,B1)\n Aij = | 0, iff SNPij==(B1,B2)\n \\ -1, iff SNPij==(B2,B2)\n\n The SNP data and the meta information (such as iid, sex and phenotype) are\n stored in the dataframe datadf, index is the Individual ID, \n with following columns for metainfo:\n\n * family_id -> Family ID\n * paternal_id -> Paternal ID\n * maternal_id -> Maternal ID\n * sex -> Sex (1=male; 2=female; other=unknown)\n * phenotype -> Phenotype (-9, or 0 for unknown)\n * population -> Population string (e.g. 
'ASW' - 'YRI')\n * rest are SNP rs (ids)\n\n More information is given in infodf:\n\n * Chromosome:\n - autosomal chromosemes -> 1-22\n - X X chromosome -> 23\n - Y Y chromosome -> 24\n - XY Pseudo-autosomal region of X -> 25\n - MT Mitochondrial -> 26\n * Relative Positon (to Chromosome) [base pairs]\n\n ", "urls": ["http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/"], "size": 3458246739}, "olivetti_faces": {"files": [["att_faces.zip"], ["olivettifaces.mat"]], "license": null, "citation": "Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994", "details": "Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. ", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", "http://www.cs.nyu.edu/~roweis/data/"], "size": 8561331}, "della_gatta": {"files": [["DellaGattadata.mat"]], "license": null, "citation": "Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. 
Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008", "details": "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/della_gatta/"], "size": 3729650}, "epomeo_gpx": {"files": [["endomondo_1.gpx", "endomondo_2.gpx", "garmin_watch_via_endomondo.gpx", "viewranger_phone.gpx", "viewranger_tablet.gpx"]], "license": null, "citation": "", "details": "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/epomeo_gpx/"], "size": 2031872}} \ No newline at end of file +{ + "olivetti_glasses": { + "files": [ + [ + "has_glasses.np" + ], + [ + "olivettifaces.mat" + ] + ], + "license": null, + "citation": "Information recorded in olivetti_faces entry. Should be used from there.", + "details": "Information recorded in olivetti_faces entry. Should be used from there.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", + "http://www.cs.nyu.edu/~roweis/data/" + ], + "size": 4261047 + }, + "boston_housing": { + "files": [ + [ + "Index", + "housing.data", + "housing.names" + ] + ], + "license": null, + "citation": "Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. 
Economics & Management, vol.5, 81-102, 1978.", + "details": "The Boston Housing data relates house values in Boston to a range of input variables.", + "urls": [ + "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/" + ], + "size": 51276 + }, + "google_trends": { + "files": [ + [] + ], + "license": null, + "citation": "", + "details": "Google trends results.", + "urls": [ + "http://www.google.com/trends/" + ], + "size": 0 + }, + "mauna_loa": { + "files": [ + [ + "co2_mm_mlo.txt" + ] + ], + "license": "-------------------------------------------------------------------- USE OF NOAA ESRL DATA\n\n These data are made freely available to the public and the scientific community in the belief that their wide dissemination will lead to greater understanding and new scientific insights. The availability of these data does not constitute publication of the data. NOAA relies on the ethics and integrity of the user to insure that ESRL receives fair credit for their work. If the data are obtained for potential use in a publication or presentation, ESRL should be informed at the outset of the nature of this work. If the ESRL data are essential to the work, or if an important result or conclusion depends on the ESRL data, co-authorship may be appropriate. This should be discussed at an early stage in the work. Manuscripts using the ESRL data should be sent to ESRL for review before they are submitted for publication so we can insure that the quality and limitations of the data are accurately represented.\n\n Contact: Pieter Tans (303 497 6678; pieter.tans@noaa.gov)\n\n RECIPROCITY Use of these data implies an agreement to reciprocate. Laboratories making similar measurements agree to make their own data available to the general public and to the scientific community in an equally complete and easily accessible form. 
Modelers are encouraged to make available to the community, upon request, their own tools used in the interpretation of the ESRL data, namely well documented model code, transport fields, and additional information necessary for other scientists to repeat the work and to run modified versions. Model availability includes collaborative support for new users of the models.\n --------------------------------------------------------------------\n\n See www.esrl.noaa.gov/gmd/ccgg/trends/ for additional details.", + "citation": "Mauna Loa Data. Dr. Pieter Tans, NOAA/ESRL (www.esrl.noaa.gov/gmd/ccgg/trends/) and Dr. Ralph Keeling, Scripps Institution of Oceanography (scrippsco2.ucsd.edu/).", + "details": "The 'average' column contains the monthly mean CO2 mole fraction determined from daily averages. The mole fraction of CO2, expressed as parts per million (ppm) is the number of molecules of CO2 in every one million molecules of dried air (water vapor removed). If there are missing days concentrated either early or late in the month, the monthly mean is corrected to the middle of the month using the average seasonal cycle. Missing months are denoted by -99.99. The 'interpolated' column includes average values from the preceding column and interpolated values where data are missing. Interpolated values are computed in two steps. First, we compute for each month the average seasonal cycle in a 7-year window around each monthly value. In this way the seasonal cycle is allowed to change slowly over time. We then determine the 'trend' value for each month by removing the seasonal cycle; this result is shown in the 'trend' column. Trend values are linearly interpolated for missing months. 
The interpolated monthly mean is then the sum of the average seasonal cycle value and the trend value for the missing month.\n\nNOTE: In general, the data presented for the last year are subject to change, depending on recalibration of the reference gas mixtures used, and other quality control procedures. Occasionally, earlier years may also be changed for the same reasons. Usually these changes are minor.\n\nCO2 expressed as a mole fraction in dry air, micromol/mol, abbreviated as ppm \n\n (-99.99 missing data; -1 no data for daily means in month)", + "urls": [ + "ftp://aftp.cmdl.noaa.gov/products/trends/co2/" + ], + "size": 46779 + }, + "osu_run1": { + "files": [ + [ + "run1TXT.ZIP" + ], + [ + "connections.txt" + ] + ], + "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", + "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", + "details": "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.", + "urls": [ + "http://accad.osu.edu/research/mocap/data/", + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/" + ], + "size": 338103 + }, + "swiss_roll": { + "files": [ + [ + "swiss_roll_data.mat" + ] + ], + "license": null, + "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. 
Langford, Science 290 (5500): 2319-2323, 22 December 2000", + "details": "Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", + "urls": [ + "http://isomap.stanford.edu/" + ], + "size": 800256 + }, + "ripley_prnn_data": { + "files": [ + [ + "Cushings.dat", + "README", + "crabs.dat", + "fglass.dat", + "fglass.grp", + "pima.te", + "pima.tr", + "pima.tr2", + "synth.te", + "synth.tr", + "viruses.dat", + "virus3.dat" + ] + ], + "license": null, + "citation": "Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7", + "details": "Data sets from Brian Ripley's Pattern Recognition and Neural Networks", + "urls": [ + "http://www.stats.ox.ac.uk/pub/PRNN/" + ], + "size": 93565 + }, + "rogers_girolami_data": { + "files": [ + [ + "firstcoursemldata.tar.gz" + ] + ], + "license": null, + "citation": "A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146", + "details": "Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.", + "urls": [ + "https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/" + ], + "suffices": [ + [ + "?dl=1" + ] + ], + "size": 21949154 + }, + "singlecell": { + "files": [ + [ + "singlecell.csv" + ] + ], + "license": "ScienceDirect: http://www.elsevier.com/locate/termsandconditions?utm_source=sciencedirect&utm_medium=link&utm_campaign=terms", + "citation": "Guoji Guo, Mikael Huss, Guo Qing Tong, Chaoyang Wang, Li Li Sun, Neil D. Clarke, Paul Robson, Resolution of Cell Fate Decisions Revealed by Single-Cell Gene Expression Analysis from Zygote to Blastocyst, Developmental Cell, Volume 18, Issue 4, 20 April 2010, Pages 675-685, ISSN 1534-5807, http://dx.doi.org/10.1016/j.devcel.2010.02.012. 
(http://www.sciencedirect.com/science/article/pii/S1534580710001103) Keywords: DEVBIO", + "details": "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", + "urls": [ + "http://staffwww.dcs.sheffield.ac.uk/people/M.Zwiessele/data/singlecell/" + ], + "size": 233.1 + }, + "della_gatta": { + "files": [ + [ + "DellaGattadata.mat" + ] + ], + "license": null, + "citation": "Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008", + "details": "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/della_gatta/" + ], + "size": 3729650 + }, + "creep_rupture": { + "files": [ + [ + "creeprupt.tar" + ] + ], + "license": null, + "citation": "Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.", + "details": "Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.", + "urls": [ + "http://www.msm.cam.ac.uk/map/data/tar/" + ], + "size": 602797 + }, + "olivetti_faces": { + "files": [ + [ + "att_faces.zip" + ], + [ + "olivettifaces.mat" + ] + ], + "license": null, + "citation": "Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. 
Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994", + "details": "Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. ", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", + "http://www.cs.nyu.edu/~roweis/data/" + ], + "size": 8561331 + }, + "robot_wireless": { + "files": [ + [ + "uw-floor.txt" + ] + ], + "license": null, + "citation": "WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.", + "details": "Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/robot_wireless/" + ], + "size": 284390 + }, + "cmu_mocap_full": { + "files": [ + [ + "allasfamc.zip" + ] + ], + "license": "From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. 
The database was created with funding from NSF EIA-0196217.", + "citation": "Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.\nThe database was created with funding from NSF EIA-0196217.", + "details": "CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.", + "urls": [ + "http://mocap.cs.cmu.edu/subjects" + ], + "size": null + }, + "football_data": { + "files": [ + [ + "E0.csv", + "E1.csv", + "E2.csv", + "E3.csv" + ] + ], + "license": null, + "citation": "", + "details": "Results of English football matches since 1993/94 season.", + "urls": [ + "http://www.football-data.co.uk/mmz4281/" + ], + "size": 1 + }, + "decampos_characters": { + "files": [ + [ + "characters.npy", + "digits.npy" + ] + ], + "license": null, + "citation": "T. de Campos, B. R. Babu, and M. Varma. Character recognition in natural images. VISAPP 2009.", + "details": "Examples of hand written digits taken from the de Campos et al paper on Character Recognition in Natural Images.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/decampos_digits/" + ], + "size": 2031872 + }, + "three_phase_oil_flow": { + "files": [ + [ + "DataTrnLbls.txt", + "DataTrn.txt", + "DataTst.txt", + "DataTstLbls.txt", + "DataVdn.txt", + "DataVdnLbls.txt" + ] + ], + "license": null, + "citation": "Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. 
Nuclear Instruments and Methods in Physics Research A327, 580-593", + "details": "The three phase oil data used initially for demonstrating the Generative Topographic mapping.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/three_phase_oil_flow/" + ], + "size": 712796 + }, + "pumadyn-32nm": { + "files": [ + [ + "pumadyn-32nm.tar.gz" + ] + ], + "license": "Data is made available by the Delve system at the University of Toronto", + "citation": "Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.", + "details": "Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.", + "urls": [ + "ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/" + ], + "size": 5861646 + }, + "epomeo_gpx": { + "files": [ + [ + "endomondo_1.gpx", + "endomondo_2.gpx", + "garmin_watch_via_endomondo.gpx", + "viewranger_phone.gpx", + "viewranger_tablet.gpx" + ] + ], + "license": null, + "citation": "", + "details": "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. 
there are 3 GPS devices, but one device recorded two traces).", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/epomeo_gpx/" + ], + "size": 2031872 + }, + "ankur_pose_data": { + "files": [ + [ + "ankurDataPoseSilhouette.mat" + ] + ], + "license": null, + "citation": "3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.", + "details": "Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more than the other, disambiguating the pose as to which way the individual is facing.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/ankur_pose_data/" + ], + "size": 1 + }, + "isomap_face_data": { + "files": [ + [ + "face_data.mat" + ] + ], + "license": null, + "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", + "details": "Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/isomap_face_data/" + ], + "size": 24229368 + }, + "brendan_faces": { + "files": [ + [ + "frey_rawface.mat" + ] + ], + "license": null, + "citation": "Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. 
Computer Society Press, Los Alamitos, CA.", + "details": "A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.", + "urls": [ + "http://www.cs.nyu.edu/~roweis/data/" + ], + "size": 1100584 + }, + "olympic_marathon_men": { + "files": [ + [ + "olympicMarathonTimes.csv" + ] + ], + "license": null, + "citation": null, + "details": "Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olympic_marathon_men/" + ], + "size": 584 + }, + "hapmap3": { + "files": [ + [ + "hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2", + "hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2", + "relationships_w_pops_121708.txt" + ] + ], + "license": "International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)", + "citation": "Gibbs, Richard A., et al. \"The international HapMap project.\" Nature 426.6968 (2003): 789-796.", + "details": "\n HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. \n The HapMap phase three SNP dataset - 1184 samples out of 11 populations.\n See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details.\n\n SNP_matrix (A) encoding [see Paschou et all. 
2007 (PCA-Correlated SNPs...)]:\n Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then\n\n / 1, iff SNPij==(B1,B1)\n Aij = | 0, iff SNPij==(B1,B2)\n \\ -1, iff SNPij==(B2,B2)\n\n The SNP data and the meta information (such as iid, sex and phenotype) are\n stored in the dataframe datadf, index is the Individual ID, \n with following columns for metainfo:\n\n * family_id -> Family ID\n * paternal_id -> Paternal ID\n * maternal_id -> Maternal ID\n * sex -> Sex (1=male; 2=female; other=unknown)\n * phenotype -> Phenotype (-9, or 0 for unknown)\n * population -> Population string (e.g. 'ASW' - 'YRI')\n * rest are SNP rs (ids)\n\n More information is given in infodf:\n\n * Chromosome:\n - autosomal chromosomes -> 1-22\n - X X chromosome -> 23\n - Y Y chromosome -> 24\n - XY Pseudo-autosomal region of X -> 25\n - MT Mitochondrial -> 26\n * Relative Position (to Chromosome) [base pairs]\n\n ", + "urls": [ + "http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/" + ], + "size": 3458246739 + }, + "boxjenkins_airline": { + "files": [ + [ + "boxjenkins_airline.csv" + ] + ], + "license": "You may copy and redistribute the data. You may make derivative works from the data. You may use the data for commercial purposes. You may not sublicence the data when redistributing it. You may not redistribute the data under a different license. Source attribution on any use of this data: Must refer source.", + "citation": "Box & Jenkins (1976), in file: data/airpass, Description: International airline passengers: monthly totals in thousands. 
Jan 49 – Dec 60", + "details": "International airline passengers, monthly totals from January 1949 to December 1960.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/boxjenkins_airline/" + ], + "size": 46779 + }, + "osu_accad": { + "files": [ + [ + "swagger1TXT.ZIP", + "handspring1TXT.ZIP", + "quickwalkTXT.ZIP", + "run1TXT.ZIP", + "sprintTXT.ZIP", + "dogwalkTXT.ZIP", + "camper_04TXT.ZIP", + "dance_KB3_TXT.ZIP", + "per20_TXT.ZIP", + "perTWO07_TXT.ZIP", + "perTWO13_TXT.ZIP", + "perTWO14_TXT.ZIP", + "perTWO15_TXT.ZIP", + "perTWO16_TXT.ZIP" + ], + [ + "connections.txt" + ] + ], + "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", + "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", + "details": "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.", + "urls": [ + "http://accad.osu.edu/research/mocap/data/", + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/" + ], + "size": 15922790 + }, + "xw_pen": { + "files": [ + [ + "xw_pen_15.csv" + ] + ], + "license": null, + "citation": "Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. 
Neurocomputing, 69:123--141, 2005", + "details": "Accelerometer pen data used for robust regression by Tipping and Lawrence.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/xw_pen/" + ], + "size": 3410 + } +} From a163bf985e285fdfc0960175e968317689d5062e Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Mon, 12 May 2014 09:13:30 +0100 Subject: [PATCH 2/2] [data] edit json file directly, removed datasets.py and data_resources_create --- GPy/util/datasets.py | 1127 -------------------- GPy/util/datasets/data_resources_create.py | 176 --- 2 files changed, 1303 deletions(-) delete mode 100644 GPy/util/datasets.py delete mode 100644 GPy/util/datasets/data_resources_create.py diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py deleted file mode 100644 index 02c5cdb9..00000000 --- a/GPy/util/datasets.py +++ /dev/null @@ -1,1127 +0,0 @@ -import csv -import os -import copy -import numpy as np -import pylab as pb -import GPy -import scipy.io -import cPickle as pickle -import zipfile -import tarfile -import datetime -import json -import re - -ipython_available=True -try: - import IPython -except ImportError: - ipython_available=False - - -import sys, urllib2 - -def reporthook(a,b,c): - # ',' at the end of the line is important! - #print "% 3.1f%% of %d bytes\r" % (min(100, float(a * b) / c * 100), c), - #you can also use sys.stdout.write - sys.stdout.write("\r% 3.1f%% of %d bytes" % (min(100, float(a * b) / c * 100), c)) - sys.stdout.flush() - -# Global variables -data_path = os.path.join(os.path.dirname(__file__), 'datasets') -default_seed = 10000 -overide_manual_authorize=False -neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/' - -# Read data resources from json file. 
-# Don't do this when ReadTheDocs is scanning as it breaks things -on_rtd = os.environ.get('READTHEDOCS', None) == 'True' #Checks if RTD is scanning - -if not (on_rtd): - path = os.path.join(os.path.dirname(__file__), 'data_resources.json') - json_data=open(path).read() - data_resources = json.loads(json_data) - -if not (on_rtd): - path = os.path.join(os.path.dirname(__file__), 'football_teams.json') - json_data=open(path).read() - football_dict = json.loads(json_data) - - - -def prompt_user(prompt): - """Ask user for agreeing to data set licenses.""" - # raw_input returns the empty string for "enter" - yes = set(['yes', 'y']) - no = set(['no','n']) - - try: - print(prompt) - choice = raw_input().lower() - # would like to test for exception here, but not sure if we can do that without importing IPython - except: - print('Stdin is not implemented.') - print('You need to set') - print('overide_manual_authorize=True') - print('to proceed with the download. Please set that variable and continue.') - raise - - - if choice in yes: - return True - elif choice in no: - return False - else: - print("Your response was a " + choice) - print("Please respond with 'yes', 'y' or 'no', 'n'") - #return prompt_user() - - -def data_available(dataset_name=None): - """Check if the data set is available on the local machine already.""" - for file_list in data_resources[dataset_name]['files']: - for file in file_list: - if not os.path.exists(os.path.join(data_path, dataset_name, file)): - return False - return True - -def download_url(url, store_directory, save_name = None, messages = True, suffix=''): - """Download a file from a url and save it to disk.""" - i = url.rfind('/') - file = url[i+1:] - print file - dir_name = os.path.join(data_path, store_directory) - save_name = os.path.join(dir_name, file) - print "Downloading ", url, "->", os.path.join(store_directory, file) - if not os.path.exists(dir_name): - os.makedirs(dir_name) - try: - response = urllib2.urlopen(url+suffix) - except 
urllib2.URLError, e: - if not hasattr(e, "code"): - raise - response = e - if response.code > 399 and response.code<500: - raise ValueError('Tried url ' + url + suffix + ' and received client error ' + str(response.code)) - elif response.code > 499: - raise ValueError('Tried url ' + url + suffix + ' and received server error ' + str(response.code)) - with open(save_name, 'wb') as f: - meta = response.info() - file_size = int(meta.getheaders("Content-Length")[0]) - status = "" - file_size_dl = 0 - block_sz = 8192 - line_length=30 - while True: - buff = response.read(block_sz) - if not buff: - break - file_size_dl += len(buff) - f.write(buff) - sys.stdout.write(" "*(len(status)) + "\r") - status = r"[{perc: <{ll}}] {dl:7.3f}/{full:.3f}MB".format(dl=file_size_dl/(1.*1e6), - full=file_size/(1.*1e6), ll=line_length, - perc="="*int(line_length*float(file_size_dl)/file_size)) - sys.stdout.write(status) - sys.stdout.flush() - sys.stdout.write(" "*(len(status)) + "\r") - print status - # if we wanted to get more sophisticated maybe we should check the response code here again even for successes. - #with open(save_name, 'wb') as f: - # f.write(response.read()) - - #urllib.urlretrieve(url+suffix, save_name, reporthook) - -def authorize_download(dataset_name=None): - """Check with the user that the are happy with terms and conditions for the data set.""" - print('Acquiring resource: ' + dataset_name) - # TODO, check resource is in dictionary! 
- print('') - dr = data_resources[dataset_name] - print('Details of data: ') - print(dr['details']) - print('') - if dr['citation']: - print('Please cite:') - print(dr['citation']) - print('') - if dr['size']: - print('After downloading the data will take up ' + str(dr['size']) + ' bytes of space.') - print('') - print('Data will be stored in ' + os.path.join(data_path, dataset_name) + '.') - print('') - if overide_manual_authorize: - if dr['license']: - print('You have agreed to the following license:') - print(dr['license']) - print('') - return True - else: - if dr['license']: - print('You must also agree to the following license:') - print(dr['license']) - print('') - return prompt_user('Do you wish to proceed with the download? [yes/no]') - -def download_data(dataset_name=None): - """Check with the user that the are happy with terms and conditions for the data set, then download it.""" - - dr = data_resources[dataset_name] - if not authorize_download(dataset_name): - raise Exception("Permission to download data set denied.") - - if dr.has_key('suffices'): - for url, files, suffices in zip(dr['urls'], dr['files'], dr['suffices']): - for file, suffix in zip(files, suffices): - download_url(os.path.join(url,file), dataset_name, dataset_name, suffix=suffix) - else: - for url, files in zip(dr['urls'], dr['files']): - for file in files: - download_url(os.path.join(url,file), dataset_name, dataset_name) - return True - -def data_details_return(data, data_set): - """Update the data component of the data dictionary with details drawn from the data_resources.""" - data.update(data_resources[data_set]) - return data - - -def cmu_urls_files(subj_motions, messages = True): - ''' - Find which resources are missing on the local disk for the requested CMU motion capture motions. 
- ''' - dr = data_resources['cmu_mocap_full'] - cmu_url = dr['urls'][0] - - subjects_num = subj_motions[0] - motions_num = subj_motions[1] - - resource = {'urls' : [], 'files' : []} - # Convert numbers to strings - subjects = [] - motions = [list() for _ in range(len(subjects_num))] - for i in range(len(subjects_num)): - curSubj = str(int(subjects_num[i])) - if int(subjects_num[i]) < 10: - curSubj = '0' + curSubj - subjects.append(curSubj) - for j in range(len(motions_num[i])): - curMot = str(int(motions_num[i][j])) - if int(motions_num[i][j]) < 10: - curMot = '0' + curMot - motions[i].append(curMot) - - all_skels = [] - - assert len(subjects) == len(motions) - - all_motions = [] - - for i in range(len(subjects)): - skel_dir = os.path.join(data_path, 'cmu_mocap') - cur_skel_file = os.path.join(skel_dir, subjects[i] + '.asf') - - url_required = False - file_download = [] - if not os.path.exists(cur_skel_file): - # Current skel file doesn't exist. - if not os.path.isdir(skel_dir): - os.mkdir(skel_dir) - # Add skel file to list. 
- url_required = True - file_download.append(subjects[i] + '.asf') - for j in range(len(motions[i])): - file_name = subjects[i] + '_' + motions[i][j] + '.amc' - cur_motion_file = os.path.join(skel_dir, file_name) - if not os.path.exists(cur_motion_file): - url_required = True - file_download.append(subjects[i] + '_' + motions[i][j] + '.amc') - if url_required: - resource['urls'].append(cmu_url + '/' + subjects[i] + '/') - resource['files'].append(file_download) - return resource - -try: - import gpxpy - import gpxpy.gpx - gpxpy_available = True - -except ImportError: - gpxpy_available = False - -if gpxpy_available: - def epomeo_gpx(data_set='epomeo_gpx', sample_every=4): - if not data_available(data_set): - download_data(data_set) - files = ['endomondo_1', 'endomondo_2', 'garmin_watch_via_endomondo','viewranger_phone', 'viewranger_tablet'] - - X = [] - for file in files: - gpx_file = open(os.path.join(data_path, 'epomeo_gpx', file + '.gpx'), 'r') - - gpx = gpxpy.parse(gpx_file) - segment = gpx.tracks[0].segments[0] - points = [point for track in gpx.tracks for segment in track.segments for point in segment.points] - data = [[(point.time-datetime.datetime(2013,8,21)).total_seconds(), point.latitude, point.longitude, point.elevation] for point in points] - X.append(np.asarray(data)[::sample_every, :]) - gpx_file.close() - return data_details_return({'X' : X, 'info' : 'Data is an array containing time in seconds, latitude, longitude and elevation in that order.'}, data_set) - -#del gpxpy_available - - - -# Some general utilities. -def sample_class(f): - p = 1. / (1. 
+ np.exp(-f)) - c = np.random.binomial(1, p) - c = np.where(c, 1, -1) - return c - -def boston_housing(data_set='boston_housing'): - if not data_available(data_set): - download_data(data_set) - all_data = np.genfromtxt(os.path.join(data_path, data_set, 'housing.data')) - X = all_data[:, 0:13] - Y = all_data[:, 13:14] - return data_details_return({'X' : X, 'Y': Y}, data_set) - -def brendan_faces(data_set='brendan_faces'): - if not data_available(data_set): - download_data(data_set) - mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'frey_rawface.mat')) - Y = mat_data['ff'].T - return data_details_return({'Y': Y}, data_set) - -def della_gatta_TRP63_gene_expression(data_set='della_gatta', gene_number=None): - if not data_available(data_set): - download_data(data_set) - mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'DellaGattadata.mat')) - X = np.double(mat_data['timepoints']) - if gene_number == None: - Y = mat_data['exprs_tp53_RMA'] - else: - Y = mat_data['exprs_tp53_RMA'][:, gene_number] - if len(Y.shape) == 1: - Y = Y[:, None] - return data_details_return({'X': X, 'Y': Y, 'gene_number' : gene_number}, data_set) - - - -def football_data(season='1314', data_set='football_data'): - """Football data from English games since 1993. This downloads data from football-data.co.uk for the given season. 
""" - def league2num(string): - league_dict = {'E0':0, 'E1':1, 'E2': 2, 'E3': 3, 'EC':4} - return league_dict[string] - - def football2num(string): - if football_dict.has_key(string): - return football_dict[string] - else: - football_dict[string] = len(football_dict)+1 - return len(football_dict)+1 - - data_set_season = data_set + '_' + season - data_resources[data_set_season] = copy.deepcopy(data_resources[data_set]) - data_resources[data_set_season]['urls'][0]+=season + '/' - start_year = int(season[0:2]) - end_year = int(season[2:4]) - files = ['E0.csv', 'E1.csv', 'E2.csv', 'E3.csv'] - if start_year>4 and start_year < 93: - files += ['EC.csv'] - data_resources[data_set_season]['files'] = [files] - if not data_available(data_set_season): - download_data(data_set_season) - for file in reversed(files): - filename = os.path.join(data_path, data_set_season, file) - # rewrite files removing blank rows. - writename = os.path.join(data_path, data_set_season, 'temp.csv') - input = open(filename, 'rb') - output = open(writename, 'wb') - writer = csv.writer(output) - for row in csv.reader(input): - if any(field.strip() for field in row): - writer.writerow(row) - input.close() - output.close() - table = np.loadtxt(writename,skiprows=1, usecols=(0, 1, 2, 3, 4, 5), converters = {0: league2num, 1: pb.datestr2num, 2:football2num, 3:football2num}, delimiter=',') - X = table[:, :4] - Y = table[:, 4:] - return data_details_return({'X': X, 'Y': Y}, data_set) - -# This will be for downloading google trends data. -def google_trends(query_terms=['big data', 'machine learning', 'data science'], data_set='google_trends'): - """Data downloaded from Google trends for given query terms. Warning, if you use this function multiple times in a row you get blocked due to terms of service violations.""" - # Inspired by this notebook: - # http://nbviewer.ipython.org/github/sahuguet/notebooks/blob/master/GoogleTrends%20meet%20Notebook.ipynb - - # quote the query terms. 
- for i, element in enumerate(query_terms): - query_terms[i] = urllib2.quote(element) - query = 'http://www.google.com/trends/fetchComponent?q=%s&cid=TIMESERIES_GRAPH_0&export=3' % ",".join(query_terms) - - data = urllib2.urlopen(query).read() - - # In the notebook they did some data cleaning: remove Javascript header+footer, and translate new Date(....,..,..) into YYYY-MM-DD. - header = """// Data table response\ngoogle.visualization.Query.setResponse(""" - data = data[len(header):-2] - data = re.sub('new Date\((\d+),(\d+),(\d+)\)', (lambda m: '"%s-%02d-%02d"' % (m.group(1).strip(), 1+int(m.group(2)), int(m.group(3)))), data) - timeseries = json.loads(data) - #import pandas as pd - columns = [k['label'] for k in timeseries['table']['cols']] - rows = map(lambda x: [k['v'] for k in x['c']], timeseries['table']['rows']) - terms = len(columns)-1 - X = np.asarray([(pb.datestr2num(row[0]), i) for i in range(terms) for row in rows ]) - Y = np.asarray([[row[i+1]] for i in range(terms) for row in rows ]) - output_info = columns[1:] - return data_details_return({'X': X, 'Y': Y, 'query_terms': output_info, 'info': "Data downloaded from google trends with query terms: " + ', '.join(output_info) + '.'}, data_set) - -# The data sets -def oil(data_set='three_phase_oil_flow'): - """The three phase oil data from Bishop and James (1993).""" - if not data_available(data_set): - download_data(data_set) - oil_train_file = os.path.join(data_path, data_set, 'DataTrn.txt') - oil_trainlbls_file = os.path.join(data_path, data_set, 'DataTrnLbls.txt') - oil_test_file = os.path.join(data_path, data_set, 'DataTst.txt') - oil_testlbls_file = os.path.join(data_path, data_set, 'DataTstLbls.txt') - oil_valid_file = os.path.join(data_path, data_set, 'DataVdn.txt') - oil_validlbls_file = os.path.join(data_path, data_set, 'DataVdnLbls.txt') - fid = open(oil_train_file) - X = np.fromfile(fid, sep='\t').reshape((-1, 12)) - fid.close() - fid = open(oil_test_file) - Xtest = np.fromfile(fid, 
sep='\t').reshape((-1, 12)) - fid.close() - fid = open(oil_valid_file) - Xvalid = np.fromfile(fid, sep='\t').reshape((-1, 12)) - fid.close() - fid = open(oil_trainlbls_file) - Y = np.fromfile(fid, sep='\t').reshape((-1, 3)) * 2. - 1. - fid.close() - fid = open(oil_testlbls_file) - Ytest = np.fromfile(fid, sep='\t').reshape((-1, 3)) * 2. - 1. - fid.close() - fid = open(oil_validlbls_file) - Yvalid = np.fromfile(fid, sep='\t').reshape((-1, 3)) * 2. - 1. - fid.close() - return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'Xtest' : Xtest, 'Xvalid': Xvalid, 'Yvalid': Yvalid}, data_set) - #else: - # throw an error - -def oil_100(seed=default_seed, data_set = 'three_phase_oil_flow'): - np.random.seed(seed=seed) - data = oil() - indices = np.random.permutation(1000) - indices = indices[0:100] - X = data['X'][indices, :] - Y = data['Y'][indices, :] - return data_details_return({'X': X, 'Y': Y, 'info': "Subsample of the full oil data extracting 100 values randomly without replacement, here seed was " + str(seed)}, data_set) - -def pumadyn(seed=default_seed, data_set='pumadyn-32nm'): - if not data_available(data_set): - download_data(data_set) - path = os.path.join(data_path, data_set) - tar = tarfile.open(os.path.join(path, 'pumadyn-32nm.tar.gz')) - print('Extracting file.') - tar.extractall(path=path) - tar.close() - # Data is variance 1, no need to normalize. 
- data = np.loadtxt(os.path.join(data_path, data_set, 'pumadyn-32nm', 'Dataset.data.gz')) - indices = np.random.permutation(data.shape[0]) - indicesTrain = indices[0:7168] - indicesTest = indices[7168:-1] - indicesTrain.sort(axis=0) - indicesTest.sort(axis=0) - X = data[indicesTrain, 0:-2] - Y = data[indicesTrain, -1][:, None] - Xtest = data[indicesTest, 0:-2] - Ytest = data[indicesTest, -1][:, None] - return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'seed': seed}, data_set) - -def robot_wireless(data_set='robot_wireless'): - # WiFi access point strengths on a tour around UW Paul Allen building. - if not data_available(data_set): - download_data(data_set) - file_name = os.path.join(data_path, data_set, 'uw-floor.txt') - all_time = np.genfromtxt(file_name, usecols=(0)) - macaddress = np.genfromtxt(file_name, usecols=(1), dtype='string') - x = np.genfromtxt(file_name, usecols=(2)) - y = np.genfromtxt(file_name, usecols=(3)) - strength = np.genfromtxt(file_name, usecols=(4)) - addresses = np.unique(macaddress) - times = np.unique(all_time) - addresses.sort() - times.sort() - allY = np.zeros((len(times), len(addresses))) - allX = np.zeros((len(times), 2)) - allY[:]=-92. - strengths={} - for address, j in zip(addresses, range(len(addresses))): - ind = np.nonzero(address==macaddress) - temp_strengths=strength[ind] - temp_x=x[ind] - temp_y=y[ind] - temp_times = all_time[ind] - for time in temp_times: - vals = time==temp_times - if any(vals): - ind2 = np.nonzero(vals) - i = np.nonzero(time==times) - allY[i, j] = temp_strengths[ind2] - allX[i, 0] = temp_x[ind2] - allX[i, 1] = temp_y[ind2] - allY = (allY + 85.)/15. - - X = allX[0:215, :] - Y = allY[0:215, :] - - Xtest = allX[215:, :] - Ytest = allY[215:, :] - return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'addresses' : addresses, 'times' : times}, data_set) - -def silhouette(data_set='ankur_pose_data'): - # Ankur Agarwal and Bill Trigg's silhoutte data. 
- if not data_available(data_set): - download_data(data_set) - mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'ankurDataPoseSilhouette.mat')) - inMean = np.mean(mat_data['Y']) - inScales = np.sqrt(np.var(mat_data['Y'])) - X = mat_data['Y'] - inMean - X = X / inScales - Xtest = mat_data['Y_test'] - inMean - Xtest = Xtest / inScales - Y = mat_data['Z'] - Ytest = mat_data['Z_test'] - return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest}, data_set) - -def decampos_digits(data_set='decampos_characters', which_digits=[0,1,2,3,4,5,6,7,8,9]): - if not data_available(data_set): - download_data(data_set) - path = os.path.join(data_path, data_set) - digits = np.load(os.path.join(path, 'digits.npy')) - digits = digits[which_digits,:,:,:] - num_classes, num_samples, height, width = digits.shape - Y = digits.reshape((digits.shape[0]*digits.shape[1],digits.shape[2]*digits.shape[3])) - lbls = np.array([[l]*num_samples for l in which_digits]).reshape(Y.shape[0], 1) - str_lbls = np.array([[str(l)]*num_samples for l in which_digits]) - return data_details_return({'Y': Y, 'lbls': lbls, 'str_lbls' : str_lbls, 'info': 'Digits data set from the de Campos characters data'}, data_set) - -def ripley_synth(data_set='ripley_prnn_data'): - if not data_available(data_set): - download_data(data_set) - train = np.genfromtxt(os.path.join(data_path, data_set, 'synth.tr'), skip_header=1) - X = train[:, 0:2] - y = train[:, 2:3] - test = np.genfromtxt(os.path.join(data_path, data_set, 'synth.te'), skip_header=1) - Xtest = test[:, 0:2] - ytest = test[:, 2:3] - return data_details_return({'X': X, 'Y': y, 'Xtest': Xtest, 'Ytest': ytest, 'info': 'Synthetic data generated by Ripley for a two class classification problem.'}, data_set) - -def mauna_loa(data_set='mauna_loa', num_train=543, refresh_data=False): - path = os.path.join(data_path, data_set) - if data_available(data_set) and not refresh_data: - print 'Using cached version of the data set, to use latest version 
set refresh_data to True' - else: - download_data(data_set) - data = np.loadtxt(os.path.join(data_path, data_set, 'co2_mm_mlo.txt')) - print 'Most recent data observation from month ', data[-1, 1], ' in year ', data[-1, 0] - allX = data[data[:, 3]!=-99.99, 2:3] - allY = data[data[:, 3]!=-99.99, 3:4] - X = allX[:num_train, 0:1] - Xtest = allX[num_train:, 0:1] - Y = allY[:num_train, 0:1] - Ytest = allY[num_train:, 0:1] - return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "Mauna Loa data with " + str(num_train) + " values used as training points."}, data_set) - - -def boxjenkins_airline(data_set='boxjenkins_airline', num_train=96): - path = os.path.join(data_path, data_set) - if not data_available(data_set): - download_data(data_set) - data = np.loadtxt(os.path.join(data_path, data_set, 'boxjenkins_airline.csv'), delimiter=',') - Y = data[:num_train, 1:2] - X = data[:num_train, 0:1] - Xtest = data[num_train:, 0:1] - Ytest = data[num_train:, 1:2] - return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "Montly airline passenger data from Box & Jenkins 1976."}, data_set) - - -def osu_run1(data_set='osu_run1', sample_every=4): - path = os.path.join(data_path, data_set) - if not data_available(data_set): - download_data(data_set) - zip = zipfile.ZipFile(os.path.join(data_path, data_set, 'run1TXT.ZIP'), 'r') - for name in zip.namelist(): - zip.extract(name, path) - Y, connect = GPy.util.mocap.load_text_data('Aug210106', path) - Y = Y[0:-1:sample_every, :] - return data_details_return({'Y': Y, 'connect' : connect}, data_set) - -def swiss_roll_generated(num_samples=1000, sigma=0.0): - with open(os.path.join(data_path, 'swiss_roll.pickle')) as f: - data = pickle.load(f) - Na = data['Y'].shape[0] - perm = np.random.permutation(np.r_[:Na])[:num_samples] - Y = data['Y'][perm, :] - t = data['t'][perm] - c = data['colors'][perm, :] - so = np.argsort(t) - Y = Y[so, :] - t = t[so] - c = c[so, :] - return {'Y':Y, 't':t, 
'colors':c} - -def hapmap3(data_set='hapmap3'): - """ - The HapMap phase three SNP dataset - 1184 samples out of 11 populations. - - SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]: - Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then - - / 1, iff SNPij==(B1,B1) - Aij = | 0, iff SNPij==(B1,B2) - \ -1, iff SNPij==(B2,B2) - - The SNP data and the meta information (such as iid, sex and phenotype) are - stored in the dataframe datadf, index is the Individual ID, - with following columns for metainfo: - - * family_id -> Family ID - * paternal_id -> Paternal ID - * maternal_id -> Maternal ID - * sex -> Sex (1=male; 2=female; other=unknown) - * phenotype -> Phenotype (-9, or 0 for unknown) - * population -> Population string (e.g. 'ASW' - 'YRI') - * rest are SNP rs (ids) - - More information is given in infodf: - - * Chromosome: - - autosomal chromosemes -> 1-22 - - X X chromosome -> 23 - - Y Y chromosome -> 24 - - XY Pseudo-autosomal region of X -> 25 - - MT Mitochondrial -> 26 - * Relative Positon (to Chromosome) [base pairs] - """ - try: - from pandas import read_pickle, DataFrame - from sys import stdout - import bz2 - except ImportError as i: - raise i, "Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset" - if not data_available(data_set): - download_data(data_set) - dirpath = os.path.join(data_path,'hapmap3') - hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly' - preprocessed_data_paths = [os.path.join(dirpath,hapmap_file_name + file_name) for file_name in \ - ['.snps.pickle', - '.info.pickle', - '.nan.pickle']] - if not reduce(lambda a,b: a and b, map(os.path.exists, preprocessed_data_paths)): - if not overide_manual_authorize and not prompt_user("Preprocessing requires ~25GB " - "of memory and can take a (very) long time, continue? [Y/n]"): - print "Preprocessing required for further usage." 
- return - status = "Preprocessing data, please be patient..." - print status - def write_status(message, progress, status): - stdout.write(" "*len(status)); stdout.write("\r"); stdout.flush() - status = r"[{perc: <{ll}}] {message: <13s}".format(message=message, ll=20, - perc="="*int(20.*progress/100.)) - stdout.write(status); stdout.flush() - return status - unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']] - if not reduce(lambda a,b: a and b, map(os.path.exists, unpacked_files)): - status=write_status('unpacking...', 0, '') - curr = 0 - for newfilepath in unpacked_files: - if not os.path.exists(newfilepath): - filepath = newfilepath + '.bz2' - file_size = os.path.getsize(filepath) - with open(newfilepath, 'wb') as new_file, open(filepath, 'rb') as f: - decomp = bz2.BZ2Decompressor() - file_processed = 0 - buffsize = 100 * 1024 - for data in iter(lambda : f.read(buffsize), b''): - new_file.write(decomp.decompress(data)) - file_processed += len(data) - status=write_status('unpacking...', curr+12.*file_processed/(file_size), status) - curr += 12 - status=write_status('unpacking...', curr, status) - status=write_status('reading .ped...', 25, status) - # Preprocess data: - snpstrnp = np.loadtxt(unpacked_files[0], dtype=str) - status=write_status('reading .map...', 33, status) - mapnp = np.loadtxt(unpacked_files[1], dtype=str) - status=write_status('reading relationships.txt...', 42, status) - # and metainfo: - infodf = DataFrame.from_csv(os.path.join(dirpath,'./relationships_w_pops_121708.txt'), header=0, sep='\t') - infodf.set_index('IID', inplace=1) - status=write_status('filtering nan...', 45, status) - snpstr = snpstrnp[:,6:].astype('S1').reshape(snpstrnp.shape[0], -1, 2) - inan = snpstr[:,:,0] == '0' - status=write_status('filtering reference alleles...', 55, status) - ref = np.array(map(lambda x: np.unique(x)[-2:], snpstr.swapaxes(0,1)[:,:,:])) - status=write_status('encoding snps...', 70, status) - # Encode the 
information for each gene in {-1,0,1}: - status=write_status('encoding snps...', 73, status) - snps = (snpstr==ref[None,:,:]) - status=write_status('encoding snps...', 76, status) - snps = (snps*np.array([1,-1])[None,None,:]) - status=write_status('encoding snps...', 78, status) - snps = snps.sum(-1) - status=write_status('encoding snps...', 81, status) - snps = snps.astype('i8') - status=write_status('marking nan values...', 88, status) - # put in nan values (masked as -128): - snps[inan] = -128 - status=write_status('setting up meta...', 94, status) - # get meta information: - metaheader = np.r_[['family_id', 'iid', 'paternal_id', 'maternal_id', 'sex', 'phenotype']] - metadf = DataFrame(columns=metaheader, data=snpstrnp[:,:6]) - metadf.set_index('iid', inplace=1) - metadf = metadf.join(infodf.population) - metadf.to_pickle(preprocessed_data_paths[1]) - # put everything together: - status=write_status('setting up snps...', 96, status) - snpsdf = DataFrame(index=metadf.index, data=snps, columns=mapnp[:,1]) - with open(preprocessed_data_paths[0], 'wb') as f: - pickle.dump(f, snpsdf, protocoll=-1) - status=write_status('setting up snps...', 98, status) - inandf = DataFrame(index=metadf.index, data=inan, columns=mapnp[:,1]) - inandf.to_pickle(preprocessed_data_paths[2]) - status=write_status('done :)', 100, status) - print '' - else: - print "loading snps..." - snpsdf = read_pickle(preprocessed_data_paths[0]) - print "loading metainfo..." - metadf = read_pickle(preprocessed_data_paths[1]) - print "loading nan entries..." - inandf = read_pickle(preprocessed_data_paths[2]) - snps = snpsdf.values - populations = metadf.population.values.astype('S3') - hapmap = dict(name=data_set, - description='The HapMap phase three SNP dataset - ' - '1184 samples out of 11 populations. 
inan is a ' - 'boolean array, containing wheather or not the ' - 'given entry is nan (nans are masked as ' - '-128 in snps).', - snpsdf=snpsdf, - metadf=metadf, - snps=snps, - inan=inandf.values, - inandf=inandf, - populations=populations) - return hapmap - -def singlecell(data_set='singlecell'): - if not data_available(data_set): - download_data(data_set) - dirpath = os.path.join(data_path, data_set) - data = np.loadtxt(os.path.join(dirpath, 'singlecell.csv'), delimiter=",", dtype=str) - genes = data[0, 1:] - labels = data[1:, 0] - Y = np.array(data[1:, 1:], dtype=float) - return data_details_return({'Y': Y, 'info' : "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", - 'genes':genes, 'labels':labels, - }, data_set) - -def swiss_roll_1000(): - return swiss_roll(num_samples=1000) - -def swiss_roll(num_samples=3000, data_set='swiss_roll'): - if not data_available(data_set): - download_data(data_set) - mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'swiss_roll_data.mat')) - Y = mat_data['X_data'][:, 0:num_samples].transpose() - return data_details_return({'Y': Y, 'X': mat_data['X_data'], 'info': "The first " + str(num_samples) + " points from the swiss roll data of Tennenbaum, de Silva and Langford (2001)."}, data_set) - -def isomap_faces(num_samples=698, data_set='isomap_face_data'): - if not data_available(data_set): - download_data(data_set) - mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'face_data.mat')) - Y = mat_data['images'][:, 0:num_samples].transpose() - return data_details_return({'Y': Y, 'poses' : mat_data['poses'], 'lights': mat_data['lights'], 'info': "The first " + str(num_samples) + " points from the face data of Tennenbaum, de Silva and Langford (2001)."}, data_set) - -def simulation_BGPLVM(): - mat_data = scipy.io.loadmat(os.path.join(data_path, 'BGPLVMSimulation.mat')) - Y = np.array(mat_data['Y'], dtype=float) - S = 
np.array(mat_data['initS'], dtype=float) - mu = np.array(mat_data['initMu'], dtype=float) - #return data_details_return({'S': S, 'Y': Y, 'mu': mu}, data_set) - return {'Y': Y, 'S': S, - 'mu' : mu, - 'info': "Simulated test dataset generated in MATLAB to compare BGPLVM between python and MATLAB"} - -def toy_rbf_1d(seed=default_seed, num_samples=500): - """ - Samples values of a function from an RBF covariance with very small noise for inputs uniformly distributed between -1 and 1. - - :param seed: seed to use for random sampling. - :type seed: int - :param num_samples: number of samples to sample in the function (default 500). - :type num_samples: int - - """ - np.random.seed(seed=seed) - num_in = 1 - X = np.random.uniform(low= -1.0, high=1.0, size=(num_samples, num_in)) - X.sort(axis=0) - rbf = GPy.kern.RBF(num_in, variance=1., lengthscale=np.array((0.25,))) - white = GPy.kern.White(num_in, variance=1e-2) - kernel = rbf + white - K = kernel.K(X) - y = np.reshape(np.random.multivariate_normal(np.zeros(num_samples), K), (num_samples, 1)) - return {'X':X, 'Y':y, 'info': "Sampled " + str(num_samples) + " values of a function from an RBF covariance with very small noise for inputs uniformly distributed between -1 and 1."} - -def toy_rbf_1d_50(seed=default_seed): - np.random.seed(seed=seed) - data = toy_rbf_1d() - indices = np.random.permutation(data['X'].shape[0]) - indices = indices[0:50] - indices.sort(axis=0) - X = data['X'][indices, :] - Y = data['Y'][indices, :] - return {'X': X, 'Y': Y, 'info': "Subsamples the toy_rbf_sample with 50 values randomly taken from the original sample.", 'seed' : seed} - - -def toy_linear_1d_classification(seed=default_seed): - np.random.seed(seed=seed) - x1 = np.random.normal(-3, 5, 20) - x2 = np.random.normal(3, 5, 20) - X = (np.r_[x1, x2])[:, None] - return {'X': X, 'Y': sample_class(2.*X), 'F': 2.*X, 'seed' : seed} - -def olivetti_glasses(data_set='olivetti_glasses', num_training=200, seed=default_seed): - path = 
os.path.join(data_path, data_set) - if not data_available(data_set): - download_data(data_set) - y = np.load(os.path.join(path, 'has_glasses.np')) - y = np.where(y=='y',1,0).reshape(-1,1) - faces = scipy.io.loadmat(os.path.join(path, 'olivettifaces.mat'))['faces'].T - np.random.seed(seed=seed) - index = np.random.permutation(faces.shape[0]) - X = faces[index[:num_training],:] - Xtest = faces[index[num_training:],:] - Y = y[index[:num_training],:] - Ytest = y[index[num_training:]] - return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'seed' : seed, 'info': "ORL Faces with labels identifiying who is wearing glasses and who isn't. Data is randomly partitioned according to given seed. Presence or absence of glasses was labelled by James Hensman."}, 'olivetti_faces') - -def olivetti_faces(data_set='olivetti_faces'): - path = os.path.join(data_path, data_set) - if not data_available(data_set): - download_data(data_set) - zip = zipfile.ZipFile(os.path.join(path, 'att_faces.zip'), 'r') - for name in zip.namelist(): - zip.extract(name, path) - Y = [] - lbls = [] - for subject in range(40): - for image in range(10): - image_path = os.path.join(path, 'orl_faces', 's'+str(subject+1), str(image+1) + '.pgm') - Y.append(GPy.util.netpbmfile.imread(image_path).flatten()) - lbls.append(subject) - Y = np.asarray(Y) - lbls = np.asarray(lbls)[:, None] - return data_details_return({'Y': Y, 'lbls' : lbls, 'info': "ORL Faces processed to 64x64 images."}, data_set) - -def xw_pen(data_set='xw_pen'): - if not data_available(data_set): - download_data(data_set) - Y = np.loadtxt(os.path.join(data_path, data_set, 'xw_pen_15.csv'), delimiter=',') - X = np.arange(485)[:, None] - return data_details_return({'Y': Y, 'X': X, 'info': "Tilt data from a personalized digital assistant pen. 
Plot in original paper showed regression between time steps 175 and 275."}, data_set) - - -def download_rogers_girolami_data(data_set='rogers_girolami_data'): - if not data_available('rogers_girolami_data'): - download_data(data_set) - path = os.path.join(data_path, data_set) - tar_file = os.path.join(path, 'firstcoursemldata.tar.gz') - tar = tarfile.open(tar_file) - print('Extracting file.') - tar.extractall(path=path) - tar.close() - -def olympic_100m_men(data_set='rogers_girolami_data'): - download_rogers_girolami_data() - olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['male100'] - - X = olympic_data[:, 0][:, None] - Y = olympic_data[:, 1][:, None] - return data_details_return({'X': X, 'Y': Y, 'info': "Olympic sprint times for 100 m men from 1896 until 2008. Example is from Rogers and Girolami's First Course in Machine Learning."}, data_set) - -def olympic_100m_women(data_set='rogers_girolami_data'): - download_rogers_girolami_data() - olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['female100'] - - X = olympic_data[:, 0][:, None] - Y = olympic_data[:, 1][:, None] - return data_details_return({'X': X, 'Y': Y, 'info': "Olympic sprint times for 100 m women from 1896 until 2008. Example is from Rogers and Girolami's First Course in Machine Learning."}, data_set) - -def olympic_200m_women(data_set='rogers_girolami_data'): - download_rogers_girolami_data() - olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['female200'] - - X = olympic_data[:, 0][:, None] - Y = olympic_data[:, 1][:, None] - return data_details_return({'X': X, 'Y': Y, 'info': "Olympic 200 m winning times for women from 1896 until 2008. 
Data is from Rogers and Girolami's First Course in Machine Learning."}, data_set) - -def olympic_200m_men(data_set='rogers_girolami_data'): - download_rogers_girolami_data() - olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['male200'] - - X = olympic_data[:, 0][:, None] - Y = olympic_data[:, 1][:, None] - return data_details_return({'X': X, 'Y': Y, 'info': "Male 200 m winning times for women from 1896 until 2008. Data is from Rogers and Girolami's First Course in Machine Learning."}, data_set) - -def olympic_400m_women(data_set='rogers_girolami_data'): - download_rogers_girolami_data() - olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['female400'] - - X = olympic_data[:, 0][:, None] - Y = olympic_data[:, 1][:, None] - return data_details_return({'X': X, 'Y': Y, 'info': "Olympic 400 m winning times for women until 2008. Data is from Rogers and Girolami's First Course in Machine Learning."}, data_set) - -def olympic_400m_men(data_set='rogers_girolami_data'): - download_rogers_girolami_data() - olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['male400'] - - X = olympic_data[:, 0][:, None] - Y = olympic_data[:, 1][:, None] - return data_details_return({'X': X, 'Y': Y, 'info': "Male 400 m winning times for women until 2008. 
Data is from Rogers and Girolami's First Course in Machine Learning."}, data_set) - -def olympic_marathon_men(data_set='olympic_marathon_men'): - if not data_available(data_set): - download_data(data_set) - olympics = np.genfromtxt(os.path.join(data_path, data_set, 'olympicMarathonTimes.csv'), delimiter=',') - X = olympics[:, 0:1] - Y = olympics[:, 1:2] - return data_details_return({'X': X, 'Y': Y}, data_set) - -def olympic_sprints(data_set='rogers_girolami_data'): - """All olympics sprint winning times for multiple output prediction.""" - X = np.zeros((0, 2)) - Y = np.zeros((0, 1)) - for i, dataset in enumerate([olympic_100m_men, - olympic_100m_women, - olympic_200m_men, - olympic_200m_women, - olympic_400m_men, - olympic_400m_women]): - data = dataset() - year = data['X'] - time = data['Y'] - X = np.vstack((X, np.hstack((year, np.ones_like(year)*i)))) - Y = np.vstack((Y, time)) - data['X'] = X - data['Y'] = Y - data['info'] = "Olympics sprint event winning for men and women to 2008. Data is from Rogers and Girolami's First Course in Machine Learning." - return data_details_return({ - 'X': X, - 'Y': Y, - 'info': "Olympics sprint event winning for men and women to 2008. 
Data is from Rogers and Girolami's First Course in Machine Learning.", - 'output_info': { - 0:'100m Men', - 1:'100m Women', - 2:'200m Men', - 3:'200m Women', - 4:'400m Men', - 5:'400m Women'} - }, data_set) - -# def movielens_small(partNo=1,seed=default_seed): -# np.random.seed(seed=seed) - -# fileName = os.path.join(data_path, 'movielens', 'small', 'u' + str(partNo) + '.base') -# fid = open(fileName) -# uTrain = np.fromfile(fid, sep='\t', dtype=np.int16).reshape((-1, 4)) -# fid.close() -# maxVals = np.amax(uTrain, axis=0) -# numUsers = maxVals[0] -# numFilms = maxVals[1] -# numRatings = uTrain.shape[0] - -# Y = scipy.sparse.lil_matrix((numFilms, numUsers), dtype=np.int8) -# for i in range(numUsers): -# ind = pb.mlab.find(uTrain[:, 0]==i+1) -# Y[uTrain[ind, 1]-1, i] = uTrain[ind, 2] - -# fileName = os.path.join(data_path, 'movielens', 'small', 'u' + str(partNo) + '.test') -# fid = open(fileName) -# uTest = np.fromfile(fid, sep='\t', dtype=np.int16).reshape((-1, 4)) -# fid.close() -# numTestRatings = uTest.shape[0] - -# Ytest = scipy.sparse.lil_matrix((numFilms, numUsers), dtype=np.int8) -# for i in range(numUsers): -# ind = pb.mlab.find(uTest[:, 0]==i+1) -# Ytest[uTest[ind, 1]-1, i] = uTest[ind, 2] - -# lbls = np.empty((1,1)) -# lblstest = np.empty((1,1)) -# return {'Y':Y, 'lbls':lbls, 'Ytest':Ytest, 'lblstest':lblstest} - - -def crescent_data(num_data=200, seed=default_seed): - """ -Data set formed from a mixture of four Gaussians. In each class two of the Gaussians are elongated at right angles to each other and offset to form an approximation to the crescent data that is popular in semi-supervised learning as a toy problem. - - :param num_data_part: number of data to be sampled (default is 200). - :type num_data: int - :param seed: random seed to be used for data generation. 
- :type seed: int - - """ - np.random.seed(seed=seed) - sqrt2 = np.sqrt(2) - # Rotation matrix - R = np.array([[sqrt2 / 2, -sqrt2 / 2], [sqrt2 / 2, sqrt2 / 2]]) - # Scaling matrices - scales = [] - scales.append(np.array([[3, 0], [0, 1]])) - scales.append(np.array([[3, 0], [0, 1]])) - scales.append([[1, 0], [0, 3]]) - scales.append([[1, 0], [0, 3]]) - means = [] - means.append(np.array([4, 4])) - means.append(np.array([0, 4])) - means.append(np.array([-4, -4])) - means.append(np.array([0, -4])) - - Xparts = [] - num_data_part = [] - num_data_total = 0 - for i in range(0, 4): - num_data_part.append(round(((i + 1) * num_data) / 4.)) - num_data_part[i] -= num_data_total - part = np.random.normal(size=(num_data_part[i], 2)) - part = np.dot(np.dot(part, scales[i]), R) + means[i] - Xparts.append(part) - num_data_total += num_data_part[i] - X = np.vstack((Xparts[0], Xparts[1], Xparts[2], Xparts[3])) - - Y = np.vstack((np.ones((num_data_part[0] + num_data_part[1], 1)), -np.ones((num_data_part[2] + num_data_part[3], 1)))) - return {'X':X, 'Y':Y, 'info': "Two separate classes of data formed approximately in the shape of two crescents."} - -def creep_data(data_set='creep_rupture'): - """Brun and Yoshida's metal creep rupture data.""" - if not data_available(data_set): - download_data(data_set) - path = os.path.join(data_path, data_set) - tar_file = os.path.join(path, 'creeprupt.tar') - tar = tarfile.open(tar_file) - print('Extracting file.') - tar.extractall(path=path) - tar.close() - all_data = np.loadtxt(os.path.join(data_path, data_set, 'taka')) - y = all_data[:, 1:2].copy() - features = [0] - features.extend(range(2, 31)) - X = all_data[:, features].copy() - return data_details_return({'X': X, 'y': y}, data_set) - -def cmu_mocap_49_balance(data_set='cmu_mocap'): - """Load CMU subject 49's one legged balancing motion that was used by Alvarez, Luengo and Lawrence at AISTATS 2009.""" - train_motions = ['18', '19'] - test_motions = ['20'] - data = cmu_mocap('49', 
train_motions, test_motions, sample_every=4, data_set=data_set) - data['info'] = "One legged balancing motions from CMU data base subject 49. As used in Alvarez, Luengo and Lawrence at AISTATS 2009. It consists of " + data['info'] - return data - -def cmu_mocap_35_walk_jog(data_set='cmu_mocap'): - """Load CMU subject 35's walking and jogging motions, the same data that was used by Taylor, Roweis and Hinton at NIPS 2007. but without their preprocessing. Also used by Lawrence at AISTATS 2007.""" - train_motions = ['01', '02', '03', '04', '05', '06', - '07', '08', '09', '10', '11', '12', - '13', '14', '15', '16', '17', '19', - '20', '21', '22', '23', '24', '25', - '26', '28', '30', '31', '32', '33', '34'] - test_motions = ['18', '29'] - data = cmu_mocap('35', train_motions, test_motions, sample_every=4, data_set=data_set) - data['info'] = "Walk and jog data from CMU data base subject 35. As used in Tayor, Roweis and Hinton at NIPS 2007, but without their pre-processing (i.e. as used by Lawrence at AISTATS 2007). It consists of " + data['info'] - return data - -def cmu_mocap(subject, train_motions, test_motions=[], sample_every=4, data_set='cmu_mocap'): - """Load a given subject's training and test motions from the CMU motion capture data.""" - # Load in subject skeleton. - subject_dir = os.path.join(data_path, data_set) - - # Make sure the data is downloaded. 
- all_motions = train_motions + test_motions - resource = cmu_urls_files(([subject], [all_motions])) - data_resources[data_set] = data_resources['cmu_mocap_full'].copy() - data_resources[data_set]['files'] = resource['files'] - data_resources[data_set]['urls'] = resource['urls'] - if resource['urls']: - download_data(data_set) - - skel = GPy.util.mocap.acclaim_skeleton(os.path.join(subject_dir, subject + '.asf')) - - # Set up labels for each sequence - exlbls = np.eye(len(train_motions)) - - # Load sequences - tot_length = 0 - temp_Y = [] - temp_lbls = [] - for i in range(len(train_motions)): - temp_chan = skel.load_channels(os.path.join(subject_dir, subject + '_' + train_motions[i] + '.amc')) - temp_Y.append(temp_chan[::sample_every, :]) - temp_lbls.append(np.tile(exlbls[i, :], (temp_Y[i].shape[0], 1))) - tot_length += temp_Y[i].shape[0] - - Y = np.zeros((tot_length, temp_Y[0].shape[1])) - lbls = np.zeros((tot_length, temp_lbls[0].shape[1])) - - end_ind = 0 - for i in range(len(temp_Y)): - start_ind = end_ind - end_ind += temp_Y[i].shape[0] - Y[start_ind:end_ind, :] = temp_Y[i] - lbls[start_ind:end_ind, :] = temp_lbls[i] - if len(test_motions) > 0: - temp_Ytest = [] - temp_lblstest = [] - - testexlbls = np.eye(len(test_motions)) - tot_test_length = 0 - for i in range(len(test_motions)): - temp_chan = skel.load_channels(os.path.join(subject_dir, subject + '_' + test_motions[i] + '.amc')) - temp_Ytest.append(temp_chan[::sample_every, :]) - temp_lblstest.append(np.tile(testexlbls[i, :], (temp_Ytest[i].shape[0], 1))) - tot_test_length += temp_Ytest[i].shape[0] - - # Load test data - Ytest = np.zeros((tot_test_length, temp_Ytest[0].shape[1])) - lblstest = np.zeros((tot_test_length, temp_lblstest[0].shape[1])) - - end_ind = 0 - for i in range(len(temp_Ytest)): - start_ind = end_ind - end_ind += temp_Ytest[i].shape[0] - Ytest[start_ind:end_ind, :] = temp_Ytest[i] - lblstest[start_ind:end_ind, :] = temp_lblstest[i] - else: - Ytest = None - lblstest = None - - info = 
'Subject: ' + subject + '. Training motions: ' - for motion in train_motions: - info += motion + ', ' - info = info[:-2] - if len(test_motions) > 0: - info += '. Test motions: ' - for motion in test_motions: - info += motion + ', ' - info = info[:-2] + '.' - else: - info += '.' - if sample_every != 1: - info += ' Data is sub-sampled to every ' + str(sample_every) + ' frames.' - return data_details_return({'Y': Y, 'lbls' : lbls, 'Ytest': Ytest, 'lblstest' : lblstest, 'info': info, 'skel': skel}, data_set) - - diff --git a/GPy/util/datasets/data_resources_create.py b/GPy/util/datasets/data_resources_create.py deleted file mode 100644 index 919e3ea4..00000000 --- a/GPy/util/datasets/data_resources_create.py +++ /dev/null @@ -1,176 +0,0 @@ -import json - -neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/' -sam_url = 'http://www.cs.nyu.edu/~roweis/data/' -cmu_url = 'http://mocap.cs.cmu.edu/subjects/' - -data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'], - 'files' : [['ankurDataPoseSilhouette.mat']], - 'license' : None, - 'citation' : """3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.""", - 'details' : """Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."""}, - - 'boston_housing' : {'urls' : ['http://archive.ics.uci.edu/ml/machine-learning-databases/housing/'], - 'files' : [['Index', 'housing.data', 'housing.names']], - 'citation' : """Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. 
Economics & Management, vol.5, 81-102, 1978.""", - 'details' : """The Boston Housing data relates house values in Boston to a range of input variables.""", - 'license' : None, - 'size' : 51276 - }, - 'brendan_faces' : {'urls' : [sam_url], - 'files': [['frey_rawface.mat']], - 'citation' : 'Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.', - 'details' : """A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.""", - 'license': None, - 'size' : 1100584}, - 'cmu_mocap_full' : {'urls' : ['http://mocap.cs.cmu.edu'], - 'files' : [['allasfamc.zip']], - 'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.' - 'The database was created with funding from NSF EIA-0196217.""", - 'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""", - 'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. 
The database was created with funding from NSF EIA-0196217.""", - 'size' : None}, - 'creep_rupture' : {'urls' : ['http://www.msm.cam.ac.uk/map/data/tar/'], - 'files' : [['creeprupt.tar']], - 'citation' : 'Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.', - 'details' : """Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.""", - 'license' : None, - 'size' : 602797}, - 'della_gatta' : {'urls' : [neil_url + 'della_gatta/'], - 'files': [['DellaGattadata.mat']], - 'citation' : 'Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008', - 'details': "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.", - 'license':None, - 'size':3729650}, - 'epomeo_gpx' : {'urls' : [neil_url + 'epomeo_gpx/'], - 'files': [['endomondo_1.gpx', 'endomondo_2.gpx', 'garmin_watch_via_endomondo.gpx','viewranger_phone.gpx','viewranger_tablet.gpx']], - 'citation' : '', - 'details': "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. 
there are 3 GPS devices, but one device recorded two traces).", - 'license':None, - 'size': 2031872}, - 'three_phase_oil_flow': {'urls' : [neil_url + 'three_phase_oil_flow/'], - 'files' : [['DataTrnLbls.txt', 'DataTrn.txt', 'DataTst.txt', 'DataTstLbls.txt', 'DataVdn.txt', 'DataVdnLbls.txt']], - 'citation' : 'Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593', - 'details' : """The three phase oil data used initially for demonstrating the Generative Topographic mapping.""", - 'license' : None, - 'size' : 712796}, - 'rogers_girolami_data' : {'urls' : ['https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/'], - 'files' : [['firstcoursemldata.tar.gz']], - 'suffices' : [['?dl=1']], - 'citation' : 'A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146', - 'details' : """Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.""", - 'license' : None, - 'size' : 21949154}, - 'olivetti_faces' : {'urls' : [neil_url + 'olivetti_faces/', sam_url], - 'files' : [['att_faces.zip'], ['olivettifaces.mat']], - 'citation' : 'Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994', - 'details' : """Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. 
""", - 'license': None, - 'size' : 8561331}, - 'olympic_marathon_men' : {'urls' : [neil_url + 'olympic_marathon_men/'], - 'files' : [['olympicMarathonTimes.csv']], - 'citation' : None, - 'details' : """Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data""", - 'license': None, - 'size' : 584}, - 'osu_run1' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'], - 'files': [['run1TXT.ZIP'],['connections.txt']], - 'details' : "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.", - 'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.', - 'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).', - 'size': 338103}, - 'osu_accad' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'], - 'files': [['swagger1TXT.ZIP','handspring1TXT.ZIP','quickwalkTXT.ZIP','run1TXT.ZIP','sprintTXT.ZIP','dogwalkTXT.ZIP','camper_04TXT.ZIP','dance_KB3_TXT.ZIP','per20_TXT.ZIP','perTWO07_TXT.ZIP','perTWO13_TXT.ZIP','perTWO14_TXT.ZIP','perTWO15_TXT.ZIP','perTWO16_TXT.ZIP'],['connections.txt']], - 'details' : "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.", - 'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.', - 'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).', - 'size': 15922790}, - 'pumadyn-32nm' : {'urls' : 
['ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/'], - 'files' : [['pumadyn-32nm.tar.gz']], - 'details' : """Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.""", - 'citation' : """Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.""", - 'license' : """Data is made available by the Delve system at the University of Toronto""", - 'size' : 5861646}, - 'robot_wireless' : {'urls' : [neil_url + 'robot_wireless/'], - 'files' : [['uw-floor.txt']], - 'citation' : """WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.""", - 'details' : """Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.""", - 'license' : None, - 'size' : 284390}, - 'swiss_roll' : {'urls' : ['http://isomap.stanford.edu/'], - 'files' : [['swiss_roll_data.mat']], - 'details' : """Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""", - 'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. 
Langford, Science 290 (5500): 2319-2323, 22 December 2000', - 'license' : None, - 'size' : 800256}, - 'ripley_prnn_data' : {'urls' : ['http://www.stats.ox.ac.uk/pub/PRNN/'], - 'files' : [['Cushings.dat', 'README', 'crabs.dat', 'fglass.dat', 'fglass.grp', 'pima.te', 'pima.tr', 'pima.tr2', 'synth.te', 'synth.tr', 'viruses.dat', 'virus3.dat']], - 'details' : """Data sets from Brian Ripley's Pattern Recognition and Neural Networks""", - 'citation': """Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7""", - 'license' : None, - 'size' : 93565}, - 'isomap_face_data' : {'urls' : [neil_url + 'isomap_face_data/'], - 'files' : [['face_data.mat']], - 'details' : """Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""", - 'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000', - 'license' : None, - 'size' : 24229368}, - 'xw_pen' : {'urls' : [neil_url + 'xw_pen/'], - 'files' : [['xw_pen_15.csv']], - 'details' : """Accelerometer pen data used for robust regression by Tipping and Lawrence.""", - 'citation' : 'Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005', - 'license' : None, - 'size' : 3410}, - 'hapmap3' : {'urls' : ['http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/'], - 'files' : [['hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2', 'hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2', 'relationships_w_pops_121708.txt']], - 'details' : """ - HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. - The HapMap phase three SNP dataset - 1184 samples out of 11 populations. 
- See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details. - - SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]: - Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then - - / 1, iff SNPij==(B1,B1) - Aij = | 0, iff SNPij==(B1,B2) - \ -1, iff SNPij==(B2,B2) - - The SNP data and the meta information (such as iid, sex and phenotype) are - stored in the dataframe datadf, index is the Individual ID, - with following columns for metainfo: - - * family_id -> Family ID - * paternal_id -> Paternal ID - * maternal_id -> Maternal ID - * sex -> Sex (1=male; 2=female; other=unknown) - * phenotype -> Phenotype (-9, or 0 for unknown) - * population -> Population string (e.g. 'ASW' - 'YRI') - * rest are SNP rs (ids) - - More information is given in infodf: - - * Chromosome: - - autosomal chromosemes -> 1-22 - - X X chromosome -> 23 - - Y Y chromosome -> 24 - - XY Pseudo-autosomal region of X -> 25 - - MT Mitochondrial -> 26 - * Relative Positon (to Chromosome) [base pairs] - - """, - 'citation': """Gibbs, Richard A., et al. "The international HapMap project." Nature 426.6968 (2003): 789-796.""", - 'license' : """International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)""", - 'size' : 2*1729092237 + 62265}, - - 'singlecell' : {'urls' : ["http://staffwww.dcs.sheffield.ac.uk/people/M.Zwiessele/data/singlecell/"], - 'files' : [['singlecell.csv']], - 'details' : "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", - 'citation' : "Guoji Guo, Mikael Huss, Guo Qing Tong, Chaoyang Wang, Li Li Sun, Neil D. 
Clarke, Paul Robson, Resolution of Cell Fate Decisions Revealed by Single-Cell Gene Expression Analysis from Zygote to Blastocyst, Developmental Cell, Volume 18, Issue 4, 20 April 2010, Pages 675-685, ISSN 1534-5807, http://dx.doi.org/10.1016/j.devcel.2010.02.012. (http://www.sciencedirect.com/science/article/pii/S1534580710001103) Keywords: DEVBIO", - 'license' : "ScienceDirect: http://www.elsevier.com/locate/termsandconditions?utm_source=sciencedirect&utm_medium=link&utm_campaign=terms", - 'size' : 233.1, - } - } - -with open('data_resources.json', 'w') as f: - print "writing data_resources" - json.dump(data_resources, f)