From 0fde50f56df88e6448c0be22cee42a0afedd2cab Mon Sep 17 00:00:00 2001
From: Neil Lawrence
Date: Wed, 28 May 2014 10:25:02 +0100
Subject: [PATCH] Changes to configuration file set-up: now uses defaults.cfg
 and installation.cfg, and searches for .gpy_user.cfg in the user's home
 directory.

---
 GPy/{gpy_config.cfg => defaults.cfg} |   9 +++++++--
 GPy/installation.cfg                 |   2 ++
 GPy/mappings/additive.py             |  62 +++++++++++++++++++++++++
 GPy/util/config.py                   |  31 ++++++++-------
 GPy/util/datasets.py                 | 128 +++++++++++++++++++-------------
 5 files changed, 163 insertions(+), 69 deletions(-)
 rename GPy/{gpy_config.cfg => defaults.cfg} (62%)
 create mode 100644 GPy/installation.cfg
 create mode 100644 GPy/mappings/additive.py

diff --git a/GPy/gpy_config.cfg b/GPy/defaults.cfg
similarity index 62%
rename from GPy/gpy_config.cfg
rename to GPy/defaults.cfg
index 43cd0ebe..50cc1107 100644
--- a/GPy/gpy_config.cfg
+++ b/GPy/defaults.cfg
@@ -1,9 +1,14 @@
-# This is the configuration file for GPy
+# This is the default configuration file for GPy.
+# Do not edit this file.
+
+# For machine-specific changes (i.e. those specific to a given installation) edit GPy/installation.cfg.
+
+# For user-specific changes edit $HOME/.gpy_user.cfg.
 
 [parallel]
 # Enable openmp support. This speeds up some computations, depending on the number
 # of cores available. Setting up a compiler with openmp support can be difficult on
-# some platforms, hence this option.
+# some platforms, hence by default it is off.
 openmp=False
 
 [datasets]
diff --git a/GPy/installation.cfg b/GPy/installation.cfg
new file mode 100644
index 00000000..867a15bf
--- /dev/null
+++ b/GPy/installation.cfg
@@ -0,0 +1,2 @@
+# This is the installation-specific (machine-local) configuration file for GPy.
+
diff --git a/GPy/mappings/additive.py b/GPy/mappings/additive.py
new file mode 100644
index 00000000..fe352a83
--- /dev/null
+++ b/GPy/mappings/additive.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from ..core.mapping import Mapping
+
+class Additive(Mapping):
+    r"""
+    Mapping based on adding two existing mappings together.
+
+    .. math::
+
+       f(\mathbf{x}_*) = f_1(\mathbf{x}_*) + f_2(\mathbf{x}_*)
+
+    :param mapping1: first mapping to add together.
+    :type mapping1: GPy.mappings.Mapping
+    :param mapping2: second mapping to add together.
+    :type mapping2: GPy.mappings.Mapping
+    :param tensor: whether or not to use the tensor product of the input spaces.
+    :type tensor: bool
+
+    """
+
+    def __init__(self, mapping1, mapping2, tensor=False):
+        if tensor:
+            input_dim = mapping1.input_dim + mapping2.input_dim
+        else:
+            input_dim = mapping1.input_dim
+            assert(mapping1.input_dim==mapping2.input_dim)
+        assert(mapping1.output_dim==mapping2.output_dim)
+        output_dim = mapping1.output_dim
+        Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
+        self.mapping1 = mapping1
+        self.mapping2 = mapping2
+        self.num_params = self.mapping1.num_params + self.mapping2.num_params
+        self.name = self.mapping1.name + '+' + self.mapping2.name
+
+    def _get_param_names(self):
+        return self.mapping1._get_param_names() + self.mapping2._get_param_names()
+
+    def _get_params(self):
+        return np.hstack((self.mapping1._get_params(), self.mapping2._get_params()))
+
+    def _set_params(self, x):
+        self.mapping1._set_params(x[:self.mapping1.num_params])
+        self.mapping2._set_params(x[self.mapping1.num_params:])
+
+    def randomize(self):
+        self.mapping1.randomize()
+        self.mapping2.randomize()
+
+    def f(self, X):
+        # The sum mapping evaluates to the sum of its components.
+        return self.mapping1.f(X) + self.mapping2.f(X)
+
+    def df_dtheta(self, dL_df, X):
+        # Stack the parameter gradients of each component, in parameter order.
+        return np.hstack((self.mapping1.df_dtheta(dL_df, X), self.mapping2.df_dtheta(dL_df, X)))
+
+    def df_dX(self, dL_df, X):
+        # Input gradients add, just like the mapping itself.
+        return self.mapping1.df_dX(dL_df, X) + self.mapping2.df_dX(dL_df, X)
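
A minimal sketch of how the new Additive mapping composes, useful as a sanity check when reviewing. The toy Constant mapping below is hypothetical (not part of GPy or of this patch); it only assumes the Mapping base-class behaviour that additive.py itself relies on (input_dim/output_dim set via Mapping.__init__, plus the num_params/_get_params protocol):

    import numpy as np
    from GPy.core.mapping import Mapping
    from GPy.mappings.additive import Additive

    class Constant(Mapping):
        """Toy mapping returning a single learnable constant; illustration only."""
        def __init__(self, input_dim, output_dim, value=1.0):
            Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
            self.num_params = 1
            self.name = 'constant'
            self.value = value
        def _get_param_names(self):
            return ['value']
        def _get_params(self):
            return np.array([self.value])
        def _set_params(self, x):
            self.value = x[0]
        def randomize(self):
            self.value = np.random.randn()
        def f(self, X):
            return self.value * np.ones((X.shape[0], self.output_dim))
        def df_dtheta(self, dL_df, X):
            return np.array([dL_df.sum()])
        def df_dX(self, dL_df, X):
            return np.zeros(X.shape)

    m = Additive(Constant(2, 1, value=1.0), Constant(2, 1, value=2.0))
    X = np.random.randn(10, 2)
    assert np.allclose(m.f(X), 3.0)        # sum of the two constants
    assert m._get_params().shape == (2,)   # parameters stack in order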
diff --git a/GPy/util/config.py b/GPy/util/config.py
index b0789fe0..28007fdf 100644
--- a/GPy/util/config.py
+++ b/GPy/util/config.py
@@ -5,18 +5,21 @@
 import ConfigParser
 import os
 config = ConfigParser.ConfigParser()
-home = os.getenv('HOME') or os.getenv('USERPROFILE')
-user_file = os.path.join(home,'.gpy_config.cfg')
-default_file = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'gpy_config.cfg'))
-# print user_file, os.path.isfile(user_file)
-# print default_file, os.path.isfile(default_file)
+# This is the default configuration file that always needs to be present.
+default_file = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'defaults.cfg'))
 
-# 1. check if the user has a ~/.gpy_config.cfg
-if os.path.isfile(user_file):
-    config.read(user_file)
-elif os.path.isfile(default_file):
-    # 2. if not, use the default one
-    config.read(default_file)
-else:
-    #3. panic
-    raise ValueError, "no configuration file found"
+# These files are optional.
+# This specifies configurations that are typically specific to the machine (it lives alongside the GPy installation).
+local_file = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'installation.cfg'))
+
+# This specifies configurations specific to the user (it is found in the user's home directory).
+home = os.getenv('HOME') or os.getenv('USERPROFILE')
+user_file = os.path.join(home, '.gpy_user.cfg')
+
+# Read in the given files; the defaults file must always be present.
+if not os.path.isfile(default_file):
+    raise ValueError("No default configuration file found at " + default_file + ".")
+config.readfp(open(default_file))
+config.read([local_file, user_file])
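
The layering the rewritten config.py implements is worth spelling out: defaults.cfg is mandatory, while installation.cfg and ~/.gpy_user.cfg are optional overrides. It relies on the fact that ConfigParser's read() silently skips files that do not exist and that later reads override earlier values. A standalone sketch of the same pattern (file names mirror the patch; the [parallel]/openmp option is the one shipped in defaults.cfg):

    import ConfigParser
    import os

    config = ConfigParser.ConfigParser()
    # The defaults must be present; a missing file raises IOError here.
    config.readfp(open('defaults.cfg'))
    home = os.getenv('HOME') or os.getenv('USERPROFILE')
    # Optional layers: missing files are skipped, later files win.
    config.read(['installation.cfg', os.path.join(home, '.gpy_user.cfg')])
    # A user setting openmp=True in ~/.gpy_user.cfg overrides the shipped default.
    print config.getboolean('parallel', 'openmp')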
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index bc0eab8d..44c9a930 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -240,7 +240,7 @@ def cmu_urls_files(subj_motions, messages = True):
         if not os.path.exists(cur_skel_file):
             # Current skel file doesn't exist.
             if not os.path.isdir(skel_dir):
-                os.mkdir(skel_dir)
+                os.makedirs(skel_dir)
             # Add skel file to list.
             url_required = True
             file_download.append(subjects[i] + '.asf')
@@ -367,8 +367,8 @@ def sod1_mouse(data_set='sod1_mouse'):
     if not data_available(data_set):
         download_data(data_set)
     from pandas import read_csv
-    dirpath = os.path.join(data_path, data_set)
-    filename = os.path.join(dirpath, 'sod1_C57_129_exprs.csv')
+    dir_path = os.path.join(data_path, data_set)
+    filename = os.path.join(dir_path, 'sod1_C57_129_exprs.csv')
     Y = read_csv(filename, header=0, index_col=0)
     num_repeats=4
     num_time=4
@@ -380,8 +380,8 @@ def spellman_yeast(data_set='spellman_yeast'):
     if not data_available(data_set):
         download_data(data_set)
     from pandas import read_csv
-    dirpath = os.path.join(data_path, data_set)
-    filename = os.path.join(dirpath, 'combined.txt')
+    dir_path = os.path.join(data_path, data_set)
+    filename = os.path.join(dir_path, 'combined.txt')
     Y = read_csv(filename, header=0, index_col=0, sep='\t')
     return data_details_return({'Y': Y}, data_set)
 
@@ -389,8 +389,8 @@ def spellman_yeast_cdc(data_set='spellman_yeast'):
     if not data_available(data_set):
         download_data(data_set)
     from pandas import read_csv
-    dirpath = os.path.join(data_path, data_set)
-    filename = os.path.join(dirpath, 'combined.txt')
+    dir_path = os.path.join(data_path, data_set)
+    filename = os.path.join(dir_path, 'combined.txt')
     Y = read_csv(filename, header=0, index_col=0, sep='\t')
     t = np.asarray([10, 30, 50, 70, 80, 90, 100, 110, 120, 130, 140, 150, 170, 180, 190, 200, 210, 220, 230, 240, 250, 270, 290])
     times = ['cdc15_'+str(time) for time in t]
@@ -403,8 +403,8 @@ def lee_yeast_ChIP(data_set='lee_yeast_ChIP'):
         download_data(data_set)
     from pandas import read_csv
     import zipfile
-    dirpath = os.path.join(data_path, data_set)
-    filename = os.path.join(dirpath, 'binding_by_gene.tsv')
+    dir_path = os.path.join(data_path, data_set)
+    filename = os.path.join(dir_path, 'binding_by_gene.tsv')
     X = read_csv(filename, header=1, index_col=0, sep='\t')
     transcription_factors = [col for col in X.columns if col[:7] != 'Unnamed']
     annotations = X[['Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3']]
@@ -416,8 +416,8 @@ def fruitfly_tomancak(data_set='fruitfly_tomancak', gene_number=None):
     if not data_available(data_set):
         download_data(data_set)
     from pandas import read_csv
-    dirpath = os.path.join(data_path, data_set)
-    filename = os.path.join(dirpath, 'tomancak_exprs.csv')
+    dir_path = os.path.join(data_path, data_set)
+    filename = os.path.join(dir_path, 'tomancak_exprs.csv')
     Y = read_csv(filename, header=0, index_col=0).T
     num_repeats = 3
     num_time = 12
@@ -431,8 +431,8 @@ def drosophila_protein(data_set='drosophila_protein'):
     if not data_available(data_set):
         download_data(data_set)
     from pandas import read_csv
-    dirpath = os.path.join(data_path, data_set)
-    filename = os.path.join(dirpath, 'becker_et_al.csv')
+    dir_path = os.path.join(data_path, data_set)
+    filename = os.path.join(dir_path, 'becker_et_al.csv')
     Y = read_csv(filename, header=0)
     return data_details_return({'Y': Y}, data_set)
 
@@ -440,8 +440,8 @@ def drosophila_knirps(data_set='drosophila_protein'):
     if not data_available(data_set):
         download_data(data_set)
     from pandas import read_csv
-    dirpath = os.path.join(data_path, data_set)
-    filename = os.path.join(dirpath, 'becker_et_al.csv')
+    dir_path = os.path.join(data_path, data_set)
+    filename = os.path.join(dir_path, 'becker_et_al.csv')
     # in the csv file we have facts_kni and ext_kni. We treat facts_kni as protein and ext_kni as mRNA
     df = read_csv(filename, header=0)
     t = df['t'][:,None]
@@ -462,31 +462,53 @@ def drosophila_knirps(data_set='drosophila_protein'):
     return data_details_return({'Y': Y, 'X': X}, data_set)
 
 # This will be for downloading google trends data.
-def google_trends(query_terms=['big data', 'machine learning', 'data science'], data_set='google_trends'):
-    """Data downloaded from Google trends for given query terms. Warning, if you use this function multiple times in a row you get blocked due to terms of service violations."""
-    # Inspired by this notebook:
-    # http://nbviewer.ipython.org/github/sahuguet/notebooks/blob/master/GoogleTrends%20meet%20Notebook.ipynb
-
-    # quote the query terms.
-    for i, element in enumerate(query_terms):
-        query_terms[i] = urllib2.quote(element)
-    query = 'http://www.google.com/trends/fetchComponent?q=%s&cid=TIMESERIES_GRAPH_0&export=3' % ",".join(query_terms)
-
-    data = urllib2.urlopen(query).read()
-
-    # In the notebook they did some data cleaning: remove Javascript header+footer, and translate new Date(....,..,..) into YYYY-MM-DD.
- header = """// Data table response\ngoogle.visualization.Query.setResponse(""" - data = data[len(header):-2] - data = re.sub('new Date\((\d+),(\d+),(\d+)\)', (lambda m: '"%s-%02d-%02d"' % (m.group(1).strip(), 1+int(m.group(2)), int(m.group(3)))), data) - timeseries = json.loads(data) - #import pandas as pd - columns = [k['label'] for k in timeseries['table']['cols']] - rows = map(lambda x: [k['v'] for k in x['c']], timeseries['table']['rows']) - terms = len(columns)-1 - X = np.asarray([(pb.datestr2num(row[0]), i) for i in range(terms) for row in rows ]) - Y = np.asarray([[row[i+1]] for i in range(terms) for row in rows ]) + df.to_csv(file_name) + else: + print "Reading cached data for google trends. To refresh the cache set 'refresh_data=True' when calling this function." + print "Query terms: ", ', '.join(query_terms) + + df = pandas.read_csv(file_name, parse_dates=[0]) + + columns = df.columns + terms = len(query_terms) + import datetime + X = np.asarray([(row, i) for i in range(terms) for row in df.index]) + Y = np.asarray([[df.ix[row][query_terms[i]]] for i in range(terms) for row in df.index ]) output_info = columns[1:] - return data_details_return({'X': X, 'Y': Y, 'query_terms': output_info, 'info': "Data downloaded from google trends with query terms: " + ', '.join(output_info) + '.'}, data_set) + + return data_details_return({'data frame' : df, 'X': X, 'Y': Y, 'query_terms': output_info, 'info': "Data downloaded from google trends with query terms: " + ', '.join(output_info) + '.'}, data_set) # The data sets def oil(data_set='three_phase_oil_flow'): @@ -630,7 +658,7 @@ def ripley_synth(data_set='ripley_prnn_data'): ytest = test[:, 2:3] return data_details_return({'X': X, 'Y': y, 'Xtest': Xtest, 'Ytest': ytest, 'info': 'Synthetic data generated by Ripley for a two class classification problem.'}, data_set) -def mauna_loa(data_set='mauna_loa', num_train=543, refresh_data=False): +def mauna_loa(data_set='mauna_loa', num_train=545, refresh_data=False): path = os.path.join(data_path, data_set) if data_available(data_set) and not refresh_data: print 'Using cached version of the data set, to use latest version set refresh_data to True' @@ -724,15 +752,15 @@ def hapmap3(data_set='hapmap3'): except ImportError as i: raise i, "Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset" - dirpath = os.path.join(data_path,'hapmap3') + dir_path = os.path.join(data_path,'hapmap3') hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly' - unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']] + unpacked_files = [os.path.join(dir_path, hapmap_file_name+ending) for ending in ['.ped', '.map']] unpacked_files_exist = reduce(lambda a, b:a and b, map(os.path.exists, unpacked_files)) if not unpacked_files_exist and not data_available(data_set): download_data(data_set) - preprocessed_data_paths = [os.path.join(dirpath,hapmap_file_name + file_name) for file_name in \ + preprocessed_data_paths = [os.path.join(dir_path,hapmap_file_name + file_name) for file_name in \ ['.snps.pickle', '.info.pickle', '.nan.pickle']] @@ -775,7 +803,7 @@ def hapmap3(data_set='hapmap3'): mapnp = np.loadtxt(unpacked_files[1], dtype=str) status=write_status('reading relationships.txt...', 42, status) # and metainfo: - infodf = DataFrame.from_csv(os.path.join(dirpath,'./relationships_w_pops_121708.txt'), header=0, sep='\t') + infodf = DataFrame.from_csv(os.path.join(dir_path,'./relationships_w_pops_121708.txt'), header=0, 
         infodf.set_index('IID', inplace=1)
         status=write_status('filtering nan...', 45, status)
         snpstr = snpstrnp[:,6:].astype('S1').reshape(snpstrnp.shape[0], -1, 2)
@@ -840,12 +862,12 @@ def singlecell(data_set='singlecell'):
         download_data(data_set)
 
     from pandas import read_csv
-    dirpath = os.path.join(data_path, data_set)
-    filename = os.path.join(dirpath, 'singlecell.csv')
+    dir_path = os.path.join(data_path, data_set)
+    filename = os.path.join(dir_path, 'singlecell.csv')
     Y = read_csv(filename, header=0, index_col=0)
     genes = Y.columns
     labels = Y.index
-    # data = np.loadtxt(os.path.join(dirpath, 'singlecell.csv'), delimiter=",", dtype=str)
+    # data = np.loadtxt(os.path.join(dir_path, 'singlecell.csv'), delimiter=",", dtype=str)
     return data_details_return({'Y': Y, 'info' : "qPCR singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", 'genes': genes, 'labels':labels, }, data_set)
 
@@ -1145,19 +1167,19 @@ def creep_data(data_set='creep_rupture'):
     X = all_data[:, features].copy()
     return data_details_return({'X': X, 'y': y}, data_set)
 
-def cifar10(data_set='cifar-10'):
+def cifar10_patches(data_set='cifar-10'):
     """The Candian Institute for Advanced Research 10 image data set. Code for loading in this data is taken from this Boris Babenko's blog post, original code available here: http://bbabenko.tumblr.com/post/86756017649/learning-low-level-vision-feautres-in-10-lines-of-code"""
-    dirpath = os.path.join(data_path, data_set)
-    filename = os.path.join(dirpath, 'cifar-10-python.tar.gz')
+    dir_path = os.path.join(data_path, data_set)
+    filename = os.path.join(dir_path, 'cifar-10-python.tar.gz')
     if not data_available(data_set):
         download_data(data_set)
     import tarfile
     # This code is from Boris Babenko's blog post.
     # http://bbabenko.tumblr.com/post/86756017649/learning-low-level-vision-feautres-in-10-lines-of-code
     tfile = tarfile.open(filename, 'r:gz')
-    tfile.extractall(dirpath)
+    tfile.extractall(dir_path)
 
-    with open(os.path.join(dirpath, 'cifar-10-batches-py','data_batch_1'),'rb') as f:
+    with open(os.path.join(dir_path, 'cifar-10-batches-py','data_batch_1'),'rb') as f:
         data = pickle.load(f)
 
     images = data['data'].reshape((-1,3,32,32)).astype('float32')/255
@@ -1167,7 +1189,7 @@ def cifar10(data_set='cifar-10'):
     for y in range(0,32-5,5):
         patches = np.concatenate((patches, images[:,x:x+5,y:y+5,:]), axis=0)
     patches = patches.reshape((patches.shape[0],-1))
-    return data_details_return({'Y': patches}, data_set)
+    return data_details_return({'Y': patches, "info": "5x5 pixel patches extracted from the CIFAR-10 images following Boris Babenko's code to demonstrate k-means features."}, data_set)
 
 def cmu_mocap_49_balance(data_set='cmu_mocap'):
     """Load CMU subject 49's one legged balancing motion that was used by Alvarez, Luengo and Lawrence at AISTATS 2009."""
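
A footnote on the google_trends rewrite: the re.sub call is the subtlest step. Google's JSONP payload encodes dates as JavaScript new Date(year, month, day) literals with zero-based months, which json.loads cannot parse, so they are rewritten as quoted YYYY-MM-DD strings first. A standalone illustration of just that step (the payload fragment is made up, not real Google Trends output):

    import re

    data = '{"v": new Date(2014,0,15)}'  # illustrative fragment only
    # Month is zero-based in JavaScript, hence the 1 + int(...) correction.
    data = re.sub(r'new Date\((\d+),(\d+),(\d+)\)',
                  lambda m: '"%s-%02d-%02d"' % (m.group(1), 1 + int(m.group(2)), int(m.group(3))),
                  data)
    print data  # prints: {"v": "2014-01-15"}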