diff --git a/GPy/gpy_config.cfg b/GPy/gpy_config.cfg
index db90dbf6..43cd0ebe 100644
--- a/GPy/gpy_config.cfg
+++ b/GPy/gpy_config.cfg
@@ -6,6 +6,10 @@
 # some platforms, hence this option.
 openmp=False
 
+[datasets]
+# location for the local data cache (environment variables are expanded)
+dir=$HOME/tmp/GPy-datasets/
+
 [anaconda]
 # if you have an anaconda python installation please specify it here.
 installed = False
diff --git a/GPy/likelihoods/ordinal.py b/GPy/likelihoods/ordinal.py
new file mode 100644
index 00000000..4ac204fd
--- /dev/null
+++ b/GPy/likelihoods/ordinal.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2014 The GPy authors (see AUTHORS.txt)
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import sympy as sym
+from GPy.util.symbolic import gammaln, normcdfln, normcdf, IndMatrix, create_matrix
+import numpy as np
+from ..util.univariate_Gaussian import std_norm_pdf, std_norm_cdf
+import link_functions
+from symbolic import Symbolic
+from scipy import stats
+
+class Ordinal(Symbolic):
+    """
+    Ordinal likelihood over a fixed number of ordered categories.
+
+    The log probability of every category is assembled symbolically from
+    normcdf/normcdfln terms in the latent value f_0. With two categories
+    these are normcdfln(-f_0) for category 0 and normcdfln(f_0) for
+    category 1; with more, the terms are offset by sums of the entries of a
+    symbolic matrix w.
+
+    .. Note::
+        Y selects the category (symbol y_0, a non-negative integer).
+        The link function defaults to the identity.
+
+    .. See also::
+        symbolic.py, for the parent class
+
+    :param categories: number of categories.
+    :param gp_link: link function, link_functions.Identity() when None.
+    """
+    def __init__(self, categories=3, gp_link=None):
+        if gp_link is None:
+            gp_link = link_functions.Identity()
+
+        dispersion = sym.Symbol('width', positive=True, real=True)
+        y_0 = sym.Symbol('y_0', nonnegative=True, integer=True)
+        f_0 = sym.Symbol('f_0', positive=True, real=True)
+        log_pdf = create_matrix('log_pdf', 1, categories)
+        log_pdf[0] = normcdfln(-f_0)
+        if categories>2:
+            w = create_matrix('w', 1, categories)
+            log_pdf[categories-1] = normcdfln(w.sum() + f_0)
+            # NOTE(review): the two CDF terms use f_0 with opposite signs;
+            # confirm this is the intended interval probability.
+            for i in range(1, categories-1):
+                log_pdf[i] = sym.log(normcdf(w[0, 0:i-1].sum() + f_0) - normcdf(w[0, 0:i].sum()-f_0) )
+        else:
+            log_pdf[1] = normcdfln(f_0)
+        log_pdf.index_var = y_0
+        super(Ordinal, self).__init__(log_pdf=log_pdf, gp_link=gp_link, name='Ordinal')
+
+        # TODO: Check this.
+        self.log_concave = True
+
diff --git a/GPy/plotting/matplot_dep/dim_reduction_plots.py b/GPy/plotting/matplot_dep/dim_reduction_plots.py
index 71e08c6b..f8413671 100644
--- a/GPy/plotting/matplot_dep/dim_reduction_plots.py
+++ b/GPy/plotting/matplot_dep/dim_reduction_plots.py
@@ -97,7 +97,7 @@ def plot_latent(model, labels=None, which_indices=None,
         elif type(ul) is np.int64:
             this_label = 'class %i' % ul
         else:
-            this_label = unicode(i)
+            this_label = unicode(ul)
         m = marker.next()
 
         index = np.nonzero(labels == ul)[0]
diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json
index 6cc692e8..d6640295 100644
--- a/GPy/util/data_resources.json
+++ b/GPy/util/data_resources.json
@@ -467,6 +467,21 @@
             "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/singlecell/"
         ]
     },
+    "sod1_mouse": {
+        "citation": "Transcriptomic indices of fast and slow disease progression in two mouse models of amyotrophic lateral sclerosis' Nardo G1, Iennaco R, Fusi N, Heath PR, Marino M, Trolese MC, Ferraiuolo L, Lawrence N, Shaw PJ, Bendotti C Brain. 2013 Nov;136(Pt 11):3305-32. doi: 10.1093/brain/awt250. Epub 2013 Sep 24.",
+        "details": "Gene expression data from two separate strains of mice: C57 and 129Sv in wild type and SOD1 mutant strains.",
+        "files": [
+            [
+                "sod1_C57_129_exprs.csv",
+                "sod1_C57_129_se.csv"
+            ]
+        ],
+        "license": null,
+        "size": 0,
+        "urls": [
+            "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/singlecell/sod1_mouse/"
+        ]
+    },
     "swiss_roll": {
         "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000",
         "details": "Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.",
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index c18431ef..05e4013e 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -12,6 +12,8 @@ import datetime
 import json
 import re
 
+from config import *
+
 ipython_available=True
 try:
     import IPython
@@ -29,7 +31,8 @@ def reporthook(a,b,c):
     sys.stdout.flush()
 
 # Global variables
-data_path = os.path.join(os.path.dirname(__file__), 'datasets')
+data_path = os.path.expandvars(config.get('datasets', 'dir'))
+#data_path = os.path.join(os.path.dirname(__file__), 'datasets')
 default_seed = 10000
 overide_manual_authorize=False
 neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/'
@@ -360,11 +363,30 @@ def football_data(season='1314', data_set='football_data'):
     Y = table[:, 4:]
     return data_details_return({'X': X, 'Y': Y}, data_set)
 
+def sod1_mouse(data_set='sod1_mouse'):
+    """Load the SOD1 mouse gene expression data, downloading it if necessary.
+
+    Y is the transposed expression table read from sod1_C57_129_exprs.csv.
+    """
+    if not data_available(data_set):
+        download_data(data_set)
+    from pandas import read_csv
+    dirpath = os.path.join(data_path, data_set)
+    filename = os.path.join(dirpath, 'sod1_C57_129_exprs.csv')
+    Y = read_csv(filename, header=0, index_col=0).T
+    num_repeats=4
+    num_time=4
+    num_cond=4
+    # TODO: X is a placeholder; no real inputs are constructed yet.
+    X = 1
+    return data_details_return({'X': X, 'Y': Y}, data_set)
+
 def fruitfly_tomancak(data_set='fruitfly_tomancak', gene_number=None):
     if not data_available(data_set):
         download_data(data_set)
     from pandas import read_csv
-    filename = os.path.join(data_path, 'tomancak_expr.csv')
+    dirpath = os.path.join(data_path, data_set)
+    filename = os.path.join(dirpath, 'tomancak_expr.csv')
     Y = read_csv(filename, header=0, index_col=0).T
     num_repeats = 3
     num_time = 12