Changes to configuration file setup: now uses defaults.cfg and installation.cfg, and searches for .gpy_user.cfg in the user's home directory.

This commit is contained in:
Neil Lawrence 2014-05-28 10:25:02 +01:00
parent 9ea236112e
commit 0fde50f56d
5 changed files with 163 additions and 66 deletions

View file

@ -1,9 +1,14 @@
# This is the configuration file for GPy # This is the default configuration file for GPy
# Do not edit this file.
# For machine specific changes (i.e. those specific to a given installation) edit GPy/installation.cfg
# For user specific changes edit $HOME/.gpy_user.cfg
[parallel] [parallel]
# Enable openmp support. This speeds up some computations, depending on the number # Enable openmp support. This speeds up some computations, depending on the number
# of cores available. Setting up a compiler with openmp support can be difficult on # of cores available. Setting up a compiler with openmp support can be difficult on
# some platforms, hence this option. # some platforms, hence by default it is off.
openmp=False openmp=False
[datasets] [datasets]

2
GPy/installation.cfg Normal file
View file

@ -0,0 +1,2 @@
# This is the local configuration file for GPy

61
GPy/mappings/additive.py Normal file
View file

@ -0,0 +1,61 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core.mapping import Mapping
import GPy
class Additive(Mapping):
    r"""
    Mapping based on adding two existing mappings together.

    .. math::

       f(\mathbf{x}*) = f_1(\mathbf{x}*) + f_2(\mathbf{x}*)

    :param mapping1: first mapping to add together.
    :type mapping1: GPy.mappings.Mapping
    :param mapping2: second mapping to add together.
    :type mapping2: GPy.mappings.Mapping
    :param tensor: whether or not to use the tensor product of input spaces.
        When True the input is treated as the column-wise concatenation of
        the inputs of ``mapping1`` followed by those of ``mapping2``.
    :type tensor: bool
    """

    def __init__(self, mapping1, mapping2, tensor=False):
        if tensor:
            # Each mapping sees its own slice of the columns of X.
            input_dim = mapping1.input_dim + mapping2.input_dim
        else:
            # Both mappings are evaluated on the very same inputs.
            assert mapping1.input_dim == mapping2.input_dim
            input_dim = mapping1.input_dim
        # Outputs are summed element-wise, so they must have the same width.
        assert mapping1.output_dim == mapping2.output_dim
        output_dim = mapping1.output_dim
        Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
        self.mapping1 = mapping1
        self.mapping2 = mapping2
        self.tensor = tensor
        self.num_params = self.mapping1.num_params + self.mapping2.num_params
        self.name = self.mapping1.name + '+' + self.mapping2.name

    def _split_inputs(self, X):
        """Return the pair of inputs to hand to mapping1 and mapping2."""
        if not self.tensor:
            return X, X
        split = self.mapping1.input_dim
        return X[:, :split], X[:, split:]

    def _get_param_names(self):
        """Parameter names of mapping1 followed by those of mapping2."""
        # NOTE(review): assumes both sub-mappings return lists -- confirm.
        return self.mapping1._get_param_names() + self.mapping2._get_param_names()

    def _get_params(self):
        """Parameters of mapping1 followed by those of mapping2, as one flat array."""
        return np.hstack((self.mapping1._get_params(), self.mapping2._get_params()))

    def _set_params(self, x):
        """Distribute the flat vector ``x`` over the two mappings.

        The first ``mapping1.num_params`` entries go to mapping1 and the
        remainder to mapping2, mirroring the layout of ``_get_params``.
        """
        split = self.mapping1.num_params
        self.mapping1._set_params(x[:split])
        self.mapping2._set_params(x[split:])

    def randomize(self):
        """Randomize the parameters of both component mappings."""
        # Use the public name: it is the one this class itself exposes, so
        # nested Additive mappings work as components.
        self.mapping1.randomize()
        self.mapping2.randomize()

    def f(self, X):
        """Evaluate the sum of the two mappings at ``X``."""
        X1, X2 = self._split_inputs(X)
        return self.mapping1.f(X1) + self.mapping2.f(X2)

    def df_dtheta(self, dL_df, X):
        """Gradient of the objective with respect to the parameters.

        Because the output is a plain sum, ``dL_df`` is passed unchanged to
        each mapping; the results are stacked in the order of ``_get_params``.
        """
        X1, X2 = self._split_inputs(X)
        return np.hstack((self.mapping1.df_dtheta(dL_df, X1),
                          self.mapping2.df_dtheta(dL_df, X2)))

    def df_dX(self, dL_df, X):
        """Gradient of the objective with respect to the inputs ``X``.

        With shared inputs the two gradients add; with tensor inputs each
        mapping only depends on its own columns, so the gradients are
        concatenated column-wise.
        """
        X1, X2 = self._split_inputs(X)
        grad1 = self.mapping1.df_dX(dL_df, X1)
        grad2 = self.mapping2.df_dX(dL_df, X2)
        if self.tensor:
            return np.hstack((grad1, grad2))
        return grad1 + grad2

View file

@ -5,18 +5,19 @@ import ConfigParser
import os import os
config = ConfigParser.ConfigParser() config = ConfigParser.ConfigParser()
home = os.getenv('HOME') or os.getenv('USERPROFILE') # This is the default configuration file that always needs to be present.
user_file = os.path.join(home,'.gpy_config.cfg') default_file = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'defaults.cfg'))
default_file = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'gpy_config.cfg'))
# print user_file, os.path.isfile(user_file)
# print default_file, os.path.isfile(default_file)
# 1. check if the user has a ~/.gpy_config.cfg # These files are optional
if os.path.isfile(user_file): # This specifies configurations that are typically specific to the machine (it is found alongside the GPy installation).
config.read(user_file) local_file = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'machine.cfg'))
elif os.path.isfile(default_file):
# 2. if not, use the default one # This specifies configurations specific to the user (it is found in the user home directory)
config.read(default_file) home = os.getenv('HOME') or os.getenv('USERPROFILE')
else: user_file = os.path.join(home,'.gpy_user.cfg')
#3. panic
raise ValueError, "no configuration file found" # Read in the given files.
config.readfp(open(default_file))
config.read([local_file, user_file])
if not config:
raise ValueError, "No configuration file found at either " + user_file + " or " + local_file + " or " + default_file + "."

View file

@ -240,7 +240,7 @@ def cmu_urls_files(subj_motions, messages = True):
if not os.path.exists(cur_skel_file): if not os.path.exists(cur_skel_file):
# Current skel file doesn't exist. # Current skel file doesn't exist.
if not os.path.isdir(skel_dir): if not os.path.isdir(skel_dir):
os.mkdir(skel_dir) os.makedirs(skel_dir)
# Add skel file to list. # Add skel file to list.
url_required = True url_required = True
file_download.append(subjects[i] + '.asf') file_download.append(subjects[i] + '.asf')
@ -367,8 +367,8 @@ def sod1_mouse(data_set='sod1_mouse'):
if not data_available(data_set): if not data_available(data_set):
download_data(data_set) download_data(data_set)
from pandas import read_csv from pandas import read_csv
dirpath = os.path.join(data_path, data_set) dir_path = os.path.join(data_path, data_set)
filename = os.path.join(dirpath, 'sod1_C57_129_exprs.csv') filename = os.path.join(dir_path, 'sod1_C57_129_exprs.csv')
Y = read_csv(filename, header=0, index_col=0) Y = read_csv(filename, header=0, index_col=0)
num_repeats=4 num_repeats=4
num_time=4 num_time=4
@ -380,8 +380,8 @@ def spellman_yeast(data_set='spellman_yeast'):
if not data_available(data_set): if not data_available(data_set):
download_data(data_set) download_data(data_set)
from pandas import read_csv from pandas import read_csv
dirpath = os.path.join(data_path, data_set) dir_path = os.path.join(data_path, data_set)
filename = os.path.join(dirpath, 'combined.txt') filename = os.path.join(dir_path, 'combined.txt')
Y = read_csv(filename, header=0, index_col=0, sep='\t') Y = read_csv(filename, header=0, index_col=0, sep='\t')
return data_details_return({'Y': Y}, data_set) return data_details_return({'Y': Y}, data_set)
@ -389,8 +389,8 @@ def spellman_yeast_cdc(data_set='spellman_yeast'):
if not data_available(data_set): if not data_available(data_set):
download_data(data_set) download_data(data_set)
from pandas import read_csv from pandas import read_csv
dirpath = os.path.join(data_path, data_set) dir_path = os.path.join(data_path, data_set)
filename = os.path.join(dirpath, 'combined.txt') filename = os.path.join(dir_path, 'combined.txt')
Y = read_csv(filename, header=0, index_col=0, sep='\t') Y = read_csv(filename, header=0, index_col=0, sep='\t')
t = np.asarray([10, 30, 50, 70, 80, 90, 100, 110, 120, 130, 140, 150, 170, 180, 190, 200, 210, 220, 230, 240, 250, 270, 290]) t = np.asarray([10, 30, 50, 70, 80, 90, 100, 110, 120, 130, 140, 150, 170, 180, 190, 200, 210, 220, 230, 240, 250, 270, 290])
times = ['cdc15_'+str(time) for time in t] times = ['cdc15_'+str(time) for time in t]
@ -403,8 +403,8 @@ def lee_yeast_ChIP(data_set='lee_yeast_ChIP'):
download_data(data_set) download_data(data_set)
from pandas import read_csv from pandas import read_csv
import zipfile import zipfile
dirpath = os.path.join(data_path, data_set) dir_path = os.path.join(data_path, data_set)
filename = os.path.join(dirpath, 'binding_by_gene.tsv') filename = os.path.join(dir_path, 'binding_by_gene.tsv')
X = read_csv(filename, header=1, index_col=0, sep='\t') X = read_csv(filename, header=1, index_col=0, sep='\t')
transcription_factors = [col for col in X.columns if col[:7] != 'Unnamed'] transcription_factors = [col for col in X.columns if col[:7] != 'Unnamed']
annotations = X[['Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3']] annotations = X[['Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3']]
@ -416,8 +416,8 @@ def fruitfly_tomancak(data_set='fruitfly_tomancak', gene_number=None):
if not data_available(data_set): if not data_available(data_set):
download_data(data_set) download_data(data_set)
from pandas import read_csv from pandas import read_csv
dirpath = os.path.join(data_path, data_set) dir_path = os.path.join(data_path, data_set)
filename = os.path.join(dirpath, 'tomancak_exprs.csv') filename = os.path.join(dir_path, 'tomancak_exprs.csv')
Y = read_csv(filename, header=0, index_col=0).T Y = read_csv(filename, header=0, index_col=0).T
num_repeats = 3 num_repeats = 3
num_time = 12 num_time = 12
@ -431,8 +431,8 @@ def drosophila_protein(data_set='drosophila_protein'):
if not data_available(data_set): if not data_available(data_set):
download_data(data_set) download_data(data_set)
from pandas import read_csv from pandas import read_csv
dirpath = os.path.join(data_path, data_set) dir_path = os.path.join(data_path, data_set)
filename = os.path.join(dirpath, 'becker_et_al.csv') filename = os.path.join(dir_path, 'becker_et_al.csv')
Y = read_csv(filename, header=0) Y = read_csv(filename, header=0)
return data_details_return({'Y': Y}, data_set) return data_details_return({'Y': Y}, data_set)
@ -440,8 +440,8 @@ def drosophila_knirps(data_set='drosophila_protein'):
if not data_available(data_set): if not data_available(data_set):
download_data(data_set) download_data(data_set)
from pandas import read_csv from pandas import read_csv
dirpath = os.path.join(data_path, data_set) dir_path = os.path.join(data_path, data_set)
filename = os.path.join(dirpath, 'becker_et_al.csv') filename = os.path.join(dir_path, 'becker_et_al.csv')
# in the csv file we have facts_kni and ext_kni. We treat facts_kni as protein and ext_kni as mRNA # in the csv file we have facts_kni and ext_kni. We treat facts_kni as protein and ext_kni as mRNA
df = read_csv(filename, header=0) df = read_csv(filename, header=0)
t = df['t'][:,None] t = df['t'][:,None]
@ -462,31 +462,59 @@ def drosophila_knirps(data_set='drosophila_protein'):
return data_details_return({'Y': Y, 'X': X}, data_set) return data_details_return({'Y': Y, 'X': X}, data_set)
# This will be for downloading google trends data. # This will be for downloading google trends data.
def google_trends(query_terms=['big data', 'machine learning', 'data science'], data_set='google_trends'): def google_trends(query_terms=['big data', 'machine learning', 'data science'], data_set='google_trends', refresh_data=False):
"""Data downloaded from Google trends for given query terms. Warning, if you use this function multiple times in a row you get blocked due to terms of service violations.""" """Data downloaded from Google trends for given query terms. Warning, if you use this function multiple times in a row you get blocked due to terms of service violations. The function will cache the result of your query, if you wish to refresh an old query set refresh_data to True. The function is inspired by this notebook: http://nbviewer.ipython.org/github/sahuguet/notebooks/blob/master/GoogleTrends%20meet%20Notebook.ipynb"""
# Inspired by this notebook: query_terms.sort()
# http://nbviewer.ipython.org/github/sahuguet/notebooks/blob/master/GoogleTrends%20meet%20Notebook.ipynb import pandas
# Create directory name for data
dir_path = os.path.join(data_path,'google_trends')
if not os.path.isdir(dir_path):
os.makedirs(dir_path)
dir_name = '-'.join(query_terms)
dir_name = dir_name.replace(' ', '_')
dir_path = os.path.join(dir_path,dir_name)
file = 'data.csv'
file_name = os.path.join(dir_path,file)
if not os.path.exists(file_name) or refresh_data:
print "Accessing Google trends to acquire the data. Note that repeated accesses will result in a block due to a google terms of service violation. Failure at this point may be due to such blocks."
# quote the query terms. # quote the query terms.
for i, element in enumerate(query_terms): quoted_terms = []
query_terms[i] = urllib2.quote(element) for term in query_terms:
query = 'http://www.google.com/trends/fetchComponent?q=%s&cid=TIMESERIES_GRAPH_0&export=3' % ",".join(query_terms) quoted_terms.append(urllib2.quote(term))
print "Query terms: ", ', '.join(query_terms)
print "Fetching query:"
query = 'http://www.google.com/trends/fetchComponent?q=%s&cid=TIMESERIES_GRAPH_0&export=3' % ",".join(quoted_terms)
data = urllib2.urlopen(query).read() data = urllib2.urlopen(query).read()
print "Done."
# In the notebook they did some data cleaning: remove Javascript header+footer, and translate new Date(....,..,..) into YYYY-MM-DD. # In the notebook they did some data cleaning: remove Javascript header+footer, and translate new Date(....,..,..) into YYYY-MM-DD.
header = """// Data table response\ngoogle.visualization.Query.setResponse(""" header = """// Data table response\ngoogle.visualization.Query.setResponse("""
data = data[len(header):-2] data = data[len(header):-2]
data = re.sub('new Date\((\d+),(\d+),(\d+)\)', (lambda m: '"%s-%02d-%02d"' % (m.group(1).strip(), 1+int(m.group(2)), int(m.group(3)))), data) data = re.sub('new Date\((\d+),(\d+),(\d+)\)', (lambda m: '"%s-%02d-%02d"' % (m.group(1).strip(), 1+int(m.group(2)), int(m.group(3)))), data)
timeseries = json.loads(data) timeseries = json.loads(data)
#import pandas as pd
columns = [k['label'] for k in timeseries['table']['cols']] columns = [k['label'] for k in timeseries['table']['cols']]
rows = map(lambda x: [k['v'] for k in x['c']], timeseries['table']['rows']) rows = map(lambda x: [k['v'] for k in x['c']], timeseries['table']['rows'])
terms = len(columns)-1 df = pandas.DataFrame(rows, columns=columns)
X = np.asarray([(pb.datestr2num(row[0]), i) for i in range(terms) for row in rows ]) if not os.path.isdir(dir_path):
Y = np.asarray([[row[i+1]] for i in range(terms) for row in rows ]) os.makedirs(dir_path)
df.to_csv(file_name)
else:
print "Reading cached data for google trends. To refresh the cache set 'refresh_data=True' when calling this function."
print "Query terms: ", ', '.join(query_terms)
df = pandas.read_csv(file_name, parse_dates=[0])
columns = df.columns
terms = len(query_terms)
import datetime
X = np.asarray([(row, i) for i in range(terms) for row in df.index])
Y = np.asarray([[df.ix[row][query_terms[i]]] for i in range(terms) for row in df.index ])
output_info = columns[1:] output_info = columns[1:]
return data_details_return({'X': X, 'Y': Y, 'query_terms': output_info, 'info': "Data downloaded from google trends with query terms: " + ', '.join(output_info) + '.'}, data_set)
return data_details_return({'data frame' : df, 'X': X, 'Y': Y, 'query_terms': output_info, 'info': "Data downloaded from google trends with query terms: " + ', '.join(output_info) + '.'}, data_set)
# The data sets # The data sets
def oil(data_set='three_phase_oil_flow'): def oil(data_set='three_phase_oil_flow'):
@ -630,7 +658,7 @@ def ripley_synth(data_set='ripley_prnn_data'):
ytest = test[:, 2:3] ytest = test[:, 2:3]
return data_details_return({'X': X, 'Y': y, 'Xtest': Xtest, 'Ytest': ytest, 'info': 'Synthetic data generated by Ripley for a two class classification problem.'}, data_set) return data_details_return({'X': X, 'Y': y, 'Xtest': Xtest, 'Ytest': ytest, 'info': 'Synthetic data generated by Ripley for a two class classification problem.'}, data_set)
def mauna_loa(data_set='mauna_loa', num_train=543, refresh_data=False): def mauna_loa(data_set='mauna_loa', num_train=545, refresh_data=False):
path = os.path.join(data_path, data_set) path = os.path.join(data_path, data_set)
if data_available(data_set) and not refresh_data: if data_available(data_set) and not refresh_data:
print 'Using cached version of the data set, to use latest version set refresh_data to True' print 'Using cached version of the data set, to use latest version set refresh_data to True'
@ -724,15 +752,15 @@ def hapmap3(data_set='hapmap3'):
except ImportError as i: except ImportError as i:
raise i, "Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset" raise i, "Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset"
dirpath = os.path.join(data_path,'hapmap3') dir_path = os.path.join(data_path,'hapmap3')
hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly' hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly'
unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']] unpacked_files = [os.path.join(dir_path, hapmap_file_name+ending) for ending in ['.ped', '.map']]
unpacked_files_exist = reduce(lambda a, b:a and b, map(os.path.exists, unpacked_files)) unpacked_files_exist = reduce(lambda a, b:a and b, map(os.path.exists, unpacked_files))
if not unpacked_files_exist and not data_available(data_set): if not unpacked_files_exist and not data_available(data_set):
download_data(data_set) download_data(data_set)
preprocessed_data_paths = [os.path.join(dirpath,hapmap_file_name + file_name) for file_name in \ preprocessed_data_paths = [os.path.join(dir_path,hapmap_file_name + file_name) for file_name in \
['.snps.pickle', ['.snps.pickle',
'.info.pickle', '.info.pickle',
'.nan.pickle']] '.nan.pickle']]
@ -775,7 +803,7 @@ def hapmap3(data_set='hapmap3'):
mapnp = np.loadtxt(unpacked_files[1], dtype=str) mapnp = np.loadtxt(unpacked_files[1], dtype=str)
status=write_status('reading relationships.txt...', 42, status) status=write_status('reading relationships.txt...', 42, status)
# and metainfo: # and metainfo:
infodf = DataFrame.from_csv(os.path.join(dirpath,'./relationships_w_pops_121708.txt'), header=0, sep='\t') infodf = DataFrame.from_csv(os.path.join(dir_path,'./relationships_w_pops_121708.txt'), header=0, sep='\t')
infodf.set_index('IID', inplace=1) infodf.set_index('IID', inplace=1)
status=write_status('filtering nan...', 45, status) status=write_status('filtering nan...', 45, status)
snpstr = snpstrnp[:,6:].astype('S1').reshape(snpstrnp.shape[0], -1, 2) snpstr = snpstrnp[:,6:].astype('S1').reshape(snpstrnp.shape[0], -1, 2)
@ -840,12 +868,12 @@ def singlecell(data_set='singlecell'):
download_data(data_set) download_data(data_set)
from pandas import read_csv from pandas import read_csv
dirpath = os.path.join(data_path, data_set) dir_path = os.path.join(data_path, data_set)
filename = os.path.join(dirpath, 'singlecell.csv') filename = os.path.join(dir_path, 'singlecell.csv')
Y = read_csv(filename, header=0, index_col=0) Y = read_csv(filename, header=0, index_col=0)
genes = Y.columns genes = Y.columns
labels = Y.index labels = Y.index
# data = np.loadtxt(os.path.join(dirpath, 'singlecell.csv'), delimiter=",", dtype=str) # data = np.loadtxt(os.path.join(dir_path, 'singlecell.csv'), delimiter=",", dtype=str)
return data_details_return({'Y': Y, 'info' : "qPCR singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", return data_details_return({'Y': Y, 'info' : "qPCR singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]",
'genes': genes, 'labels':labels, 'genes': genes, 'labels':labels,
}, data_set) }, data_set)
@ -1145,19 +1173,19 @@ def creep_data(data_set='creep_rupture'):
X = all_data[:, features].copy() X = all_data[:, features].copy()
return data_details_return({'X': X, 'y': y}, data_set) return data_details_return({'X': X, 'y': y}, data_set)
def cifar10(data_set='cifar-10'): def cifar10_patches(data_set='cifar-10'):
"""The Candian Institute for Advanced Research 10 image data set. Code for loading in this data is taken from this Boris Babenko's blog post, original code available here: http://bbabenko.tumblr.com/post/86756017649/learning-low-level-vision-feautres-in-10-lines-of-code""" """The Candian Institute for Advanced Research 10 image data set. Code for loading in this data is taken from this Boris Babenko's blog post, original code available here: http://bbabenko.tumblr.com/post/86756017649/learning-low-level-vision-feautres-in-10-lines-of-code"""
dirpath = os.path.join(data_path, data_set) dir_path = os.path.join(data_path, data_set)
filename = os.path.join(dirpath, 'cifar-10-python.tar.gz') filename = os.path.join(dir_path, 'cifar-10-python.tar.gz')
if not data_available(data_set): if not data_available(data_set):
download_data(data_set) download_data(data_set)
import tarfile import tarfile
# This code is from Boris Babenko's blog post. # This code is from Boris Babenko's blog post.
# http://bbabenko.tumblr.com/post/86756017649/learning-low-level-vision-feautres-in-10-lines-of-code # http://bbabenko.tumblr.com/post/86756017649/learning-low-level-vision-feautres-in-10-lines-of-code
tfile = tarfile.open(filename, 'r:gz') tfile = tarfile.open(filename, 'r:gz')
tfile.extractall(dirpath) tfile.extractall(dir_path)
with open(os.path.join(dirpath, 'cifar-10-batches-py','data_batch_1'),'rb') as f: with open(os.path.join(dir_path, 'cifar-10-batches-py','data_batch_1'),'rb') as f:
data = pickle.load(f) data = pickle.load(f)
images = data['data'].reshape((-1,3,32,32)).astype('float32')/255 images = data['data'].reshape((-1,3,32,32)).astype('float32')/255
@ -1167,7 +1195,7 @@ def cifar10(data_set='cifar-10'):
for y in range(0,32-5,5): for y in range(0,32-5,5):
patches = np.concatenate((patches, images[:,x:x+5,y:y+5,:]), axis=0) patches = np.concatenate((patches, images[:,x:x+5,y:y+5,:]), axis=0)
patches = patches.reshape((patches.shape[0],-1)) patches = patches.reshape((patches.shape[0],-1))
return data_details_return({'Y': patches}, data_set) return data_details_return({'Y': patches, "info" : "32x32 pixel patches extracted from the CIFAR-10 data by Boris Babenko to demonstrate k-means features."}, data_set)
def cmu_mocap_49_balance(data_set='cmu_mocap'): def cmu_mocap_49_balance(data_set='cmu_mocap'):
"""Load CMU subject 49's one legged balancing motion that was used by Alvarez, Luengo and Lawrence at AISTATS 2009.""" """Load CMU subject 49's one legged balancing motion that was used by Alvarez, Luengo and Lawrence at AISTATS 2009."""