yak shaving and whitespaces

This commit is contained in:
mzwiessele 2014-06-30 10:25:49 -07:00
parent b9e61b0ed1
commit 8a8818aa91
4 changed files with 35 additions and 34 deletions

View file

@ -253,7 +253,7 @@ class Model(Parameterized):
def _checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3):
"""
Check the gradient of the ,odel by comparing to a numerical
estimate. If the verbose flag is passed, invividual
estimate. If the verbose flag is passed, individual
components are tested (and printed)
:param verbose: If True, print a "full" checking of each parameter

View file

@ -37,7 +37,7 @@ def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan
# k = GPy.kern.RBF(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True)
p = .3
m = GPy.models.BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
if nan:
@ -144,7 +144,7 @@ def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=25, Q=4
m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel)
m.data_colors = c
m.data_t = t
if optimize:
m.optimize('bfgs', messages=verbose, max_iters=2e3)
@ -169,7 +169,7 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40,
Y = data['X'][:N]
m = GPy.models.BayesianGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing, **k)
m.data_labels = data['Y'][:N].argmax(axis=1)
if optimize:
m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05)
@ -304,7 +304,7 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1,
inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool)
Y[inan] = _np.nan
m = BayesianGPLVM(Y.copy(), Q, init="random", num_inducing=num_inducing,
m = BayesianGPLVM(Y.copy(), Q, init="random", num_inducing=num_inducing,
inference_method=VarDTCMissingData(inan=inan), kernel=k)
m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape)
@ -364,7 +364,7 @@ def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim
for inan in inanlist:
imlist.append(VarDTCMissingData(limit=1, inan=inan))
m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing,
m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing,
kernel=k, inference_method=imlist,
initx="random", initz='permute', **kw)
@ -410,11 +410,11 @@ def olivetti_faces(optimize=True, verbose=True, plot=True):
Yn /= Yn.std()
m = GPy.models.BayesianGPLVM(Yn, Q, num_inducing=20)
if optimize: m.optimize('bfgs', messages=verbose, max_iters=1000)
if plot:
ax = m.plot_latent(which_indices=(0, 1))
y = m.likelihood.Y[0, :]
y = m.Y[0, :]
data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False)
lvm = GPy.plotting.matplot_dep.visualize.lvm(m.X.mean[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish')
@ -514,7 +514,7 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
data = GPy.util.datasets.osu_run1()
Q = 6
kernel = GPy.kern.RBF(Q, lengthscale=np.repeat(.5, Q), ARD=True)
kernel = GPy.kern.RBF(Q, lengthscale=np.repeat(.5, Q), ARD=True)
m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel)
m.data = data
@ -566,7 +566,7 @@ def ssgplvm_simulation_linear():
import GPy
N, D, Q = 1000, 20, 5
pi = 0.2
def sample_X(Q, pi):
x = np.empty(Q)
dies = np.random.rand(Q)
@ -576,7 +576,7 @@ def ssgplvm_simulation_linear():
else:
x[q] = 0.
return x
Y = np.empty((N,D))
X = np.empty((N,Q))
# Generate data from random sampled weight matrices
@ -584,4 +584,4 @@ def ssgplvm_simulation_linear():
X[n] = sample_X(Q,pi)
w = np.random.randn(D,Q)
Y[n] = np.dot(w,X[n])

View file

@ -230,7 +230,7 @@ class VarDTCMissingData(LatentFunctionInference):
size = len(csa)
for i, (v,ind) in enumerate(csa.iteritems()):
if not np.all(v):
logger.info('preparing subarrays {:.3%}'.format((i+1.)/size))
logger.info('preparing subarrays {:3.3%}'.format((i+1.)/size))
v = ~np.array(v, dtype=bool)
ind = np.array(ind, dtype=int)
if ind.size == Y.shape[1]:

View file

@ -51,7 +51,7 @@ if not (on_rtd):
json_data=open(path).read()
football_dict = json.loads(json_data)
def prompt_user(prompt):
"""Ask user for agreeing to data set licenses."""
@ -128,14 +128,14 @@ def download_url(url, store_directory, save_name = None, messages = True, suffix
f.write(buff)
sys.stdout.write(" "*(len(status)) + "\r")
if file_size:
status = r"[{perc: <{ll}}] {dl:7.3f}/{full:.3f}MB".format(dl=file_size_dl/(1048576.),
full=file_size/(1048576.), ll=line_length,
status = r"[{perc: <{ll}}] {dl:7.3f}/{full:.3f}MB".format(dl=file_size_dl/(1048576.),
full=file_size/(1048576.), ll=line_length,
perc="="*int(line_length*float(file_size_dl)/file_size))
else:
status = r"[{perc: <{ll}}] {dl:7.3f}MB".format(dl=file_size_dl/(1048576.),
ll=line_length,
status = r"[{perc: <{ll}}] {dl:7.3f}MB".format(dl=file_size_dl/(1048576.),
ll=line_length,
perc="."*int(line_length*float(file_size_dl/(10*1048576.))))
sys.stdout.write(status)
sys.stdout.flush()
sys.stdout.write(" "*(len(status)) + "\r")
@ -320,7 +320,7 @@ def della_gatta_TRP63_gene_expression(data_set='della_gatta', gene_number=None):
Y = Y[:, None]
return data_details_return({'X': X, 'Y': Y, 'gene_number' : gene_number}, data_set)
def football_data(season='1314', data_set='football_data'):
"""Football data from English games since 1993. This downloads data from football-data.co.uk for the given season. """
@ -406,11 +406,11 @@ def lee_yeast_ChIP(data_set='lee_yeast_ChIP'):
dir_path = os.path.join(data_path, data_set)
filename = os.path.join(dir_path, 'binding_by_gene.tsv')
S = read_csv(filename, header=1, index_col=0, sep='\t')
transcription_factors = [col for col in S.columns if col[:7] != 'Unnamed']
transcription_factors = [col for col in S.columns if col[:7] != 'Unnamed']
annotations = S[['Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3']]
S = S[transcription_factors]
return data_details_return({'annotations' : annotations, 'Y' : S, 'transcription_factors': transcription_factors}, data_set)
def fruitfly_tomancak(data_set='fruitfly_tomancak', gene_number=None):
@ -425,7 +425,7 @@ def fruitfly_tomancak(data_set='fruitfly_tomancak', gene_number=None):
xt = np.linspace(0, num_time-1, num_time)
xr = np.linspace(0, num_repeats-1, num_repeats)
xtime, xrepeat = np.meshgrid(xt, xr)
X = np.vstack((xtime.flatten(), xrepeat.flatten())).T
X = np.vstack((xtime.flatten(), xrepeat.flatten())).T
return data_details_return({'X': X, 'Y': Y, 'gene_number' : gene_number}, data_set)
def drosophila_protein(data_set='drosophila_protein'):
@ -467,7 +467,7 @@ def google_trends(query_terms=['big data', 'machine learning', 'data science'],
"""Data downloaded from Google trends for given query terms. Warning, if you use this function multiple times in a row you get blocked due to terms of service violations. The function will cache the result of your query, if you wish to refresh an old query set refresh_data to True. The function is inspired by this notebook: http://nbviewer.ipython.org/github/sahuguet/notebooks/blob/master/GoogleTrends%20meet%20Notebook.ipynb"""
query_terms.sort()
import pandas
# Create directory name for data
dir_path = os.path.join(data_path,'google_trends')
if not os.path.isdir(dir_path):
@ -514,9 +514,9 @@ def google_trends(query_terms=['big data', 'machine learning', 'data science'],
X = np.asarray([(row, i) for i in range(terms) for row in df.index])
Y = np.asarray([[df.ix[row][query_terms[i]]] for i in range(terms) for row in df.index ])
output_info = columns[1:]
return data_details_return({'data frame' : df, 'X': X, 'Y': Y, 'query_terms': output_info, 'info': "Data downloaded from google trends with query terms: " + ', '.join(output_info) + '.'}, data_set)
# The data sets
def oil(data_set='three_phase_oil_flow'):
"""The three phase oil data from Bishop and James (1993)."""
@ -647,7 +647,7 @@ def decampos_digits(data_set='decampos_characters', which_digits=[0,1,2,3,4,5,6,
lbls = np.array([[l]*num_samples for l in which_digits]).reshape(Y.shape[0], 1)
str_lbls = np.array([[str(l)]*num_samples for l in which_digits])
return data_details_return({'Y': Y, 'lbls': lbls, 'str_lbls' : str_lbls, 'info': 'Digits data set from the de Campos characters data'}, data_set)
def ripley_synth(data_set='ripley_prnn_data'):
if not data_available(data_set):
download_data(data_set)
@ -674,7 +674,7 @@ def mauna_loa(data_set='mauna_loa', num_train=545, refresh_data=False):
Y = allY[:num_train, 0:1]
Ytest = allY[num_train:, 0:1]
return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "Mauna Loa data with " + str(num_train) + " values used as training points."}, data_set)
def boxjenkins_airline(data_set='boxjenkins_airline', num_train=96):
path = os.path.join(data_path, data_set)
@ -686,7 +686,7 @@ def boxjenkins_airline(data_set='boxjenkins_airline', num_train=96):
Xtest = data[num_train:, 0:1]
Ytest = data[num_train:, 1:2]
return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "Montly airline passenger data from Box & Jenkins 1976."}, data_set)
def osu_run1(data_set='osu_run1', sample_every=4):
path = os.path.join(data_path, data_set)
@ -725,7 +725,7 @@ def hapmap3(data_set='hapmap3'):
\ -1, iff SNPij==(B2,B2)
The SNP data and the meta information (such as iid, sex and phenotype) are
stored in the dataframe datadf, index is the Individual ID,
stored in the dataframe datadf, index is the Individual ID,
with following columns for metainfo:
* family_id -> Family ID
@ -798,7 +798,7 @@ def hapmap3(data_set='hapmap3'):
status=write_status('unpacking...', curr, status)
os.remove(filepath)
status=write_status('reading .ped...', 25, status)
# Preprocess data:
# Preprocess data:
snpstrnp = np.loadtxt(unpacked_files[0], dtype=str)
status=write_status('reading .map...', 33, status)
mapnp = np.loadtxt(unpacked_files[1], dtype=str)
@ -959,7 +959,7 @@ def olivetti_glasses(data_set='olivetti_glasses', num_training=200, seed=default
Y = y[index[:num_training],:]
Ytest = y[index[num_training:]]
return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'seed' : seed, 'info': "ORL Faces with labels identifiying who is wearing glasses and who isn't. Data is randomly partitioned according to given seed. Presence or absence of glasses was labelled by James Hensman."}, 'olivetti_faces')
def olivetti_faces(data_set='olivetti_faces'):
path = os.path.join(data_path, data_set)
if not data_available(data_set):
@ -972,7 +972,8 @@ def olivetti_faces(data_set='olivetti_faces'):
for subject in range(40):
for image in range(10):
image_path = os.path.join(path, 'orl_faces', 's'+str(subject+1), str(image+1) + '.pgm')
Y.append(GPy.util.netpbmfile.imread(image_path).flatten())
from GPy.util import netpbmfile
Y.append(netpbmfile.imread(image_path).flatten())
lbls.append(subject)
Y = np.asarray(Y)
lbls = np.asarray(lbls)[:, None]
@ -1195,7 +1196,7 @@ def cifar10_patches(data_set='cifar-10'):
for x in range(0,32-5,5):
for y in range(0,32-5,5):
patches = np.concatenate((patches, images[:,x:x+5,y:y+5,:]), axis=0)
patches = patches.reshape((patches.shape[0],-1))
patches = patches.reshape((patches.shape[0],-1))
return data_details_return({'Y': patches, "info" : "32x32 pixel patches extracted from the CIFAR-10 data by Boris Babenko to demonstrate k-means features."}, data_set)
def cmu_mocap_49_balance(data_set='cmu_mocap'):