Merge branch 'devel' of github.com:SheffieldML/GPy into devel

This commit is contained in:
Alan Saul 2015-08-17 16:06:47 +01:00
commit 65b6e54d5c
42 changed files with 10282 additions and 7712 deletions


@ -60,9 +60,11 @@ class GP(Model):
self.normalizer.scale_by(Y) self.normalizer.scale_by(Y)
self.Y_normalized = ObsAr(self.normalizer.normalize(Y)) self.Y_normalized = ObsAr(self.normalizer.normalize(Y))
self.Y = Y self.Y = Y
else: elif isinstance(Y, np.ndarray):
self.Y = ObsAr(Y) self.Y = ObsAr(Y)
self.Y_normalized = self.Y self.Y_normalized = self.Y
else:
self.Y = Y
if Y.shape[0] != self.num_data: if Y.shape[0] != self.num_data:
#There can be cases where we want more inputs than outputs, for example if we have multiple latent #There can be cases where we want more inputs than outputs, for example if we have multiple latent
@ -473,16 +475,16 @@ class GP(Model):
self.inference_method.on_optimization_end() self.inference_method.on_optimization_end()
raise raise
def infer_newX(self, Y_new, optimize=True, ): def infer_newX(self, Y_new, optimize=True):
""" """
Infer the distribution of X for the new observed data *Y_new*. Infer X for the new observed data *Y_new*.
:param Y_new: the new observed data for inference :param Y_new: the new observed data for inference
:type Y_new: numpy.ndarray :type Y_new: numpy.ndarray
:param optimize: whether to optimize the location of new X (True by default) :param optimize: whether to optimize the location of new X (True by default)
:type optimize: boolean :type optimize: boolean
:return: a tuple containing the posterior estimation of X and the model that optimizes X :return: a tuple containing the posterior estimation of X and the model that optimizes X
:rtype: (:class:`~GPy.core.parameterization.variational.VariationalPosterior` or numpy.ndarray, :class:`~GPy.core.model.Model`) :rtype: (:class:`~GPy.core.parameterization.variational.VariationalPosterior` and numpy.ndarray, :class:`~GPy.core.model.Model`)
""" """
from ..inference.latent_function_inference.inferenceX import infer_newX from ..inference.latent_function_inference.inferenceX import infer_newX
return infer_newX(self, Y_new, optimize=optimize) return infer_newX(self, Y_new, optimize=optimize)
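A minimal sketch (not part of the diff) of how the simplified infer_newX signature above might be called, assuming a standard GPy regression model; the data and numbers are made up for illustration:

    import numpy as np
    import GPy

    X = np.random.rand(30, 2)
    Y = np.sin(X[:, :1]) + 0.05 * np.random.randn(30, 1)
    m = GPy.models.GPRegression(X, Y)
    m.optimize()

    Y_new = np.sin(np.random.rand(5, 2)[:, :1])      # hypothetical new observations
    x_new, inference_model = m.infer_newX(Y_new, optimize=True)

For a plain regression model x_new is a point estimate; for a Bayesian GPLVM it is a variational posterior, as the InferenceX docstring further down notes.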


@ -257,7 +257,7 @@ class Model(Parameterized):
optimizer = optimization.get_optimizer(optimizer) optimizer = optimization.get_optimizer(optimizer)
opt = optimizer(start, model=self, max_iters=max_iters, **kwargs) opt = optimizer(start, model=self, max_iters=max_iters, **kwargs)
with VerboseOptimization(self, opt, maxiters=max_iters, verbose=messages, ipython_notebook=ipython_notebook) as vo: with VerboseOptimization(self, opt, maxiters=max_iters, verbose=messages, ipython_notebook=ipython_notebook, clear_after_finish=clear_after_finish) as vo:
opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads) opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads)
vo.finish(opt) vo.finish(opt)
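A hedged usage sketch for the new flag; this assumes the surrounding optimize() signature gained a matching clear_after_finish keyword, which is not shown in this hunk:

    # hypothetical call; clear_after_finish presumably clears the progress display once optimization ends
    m.optimize(messages=True, ipython_notebook=True, clear_after_finish=True)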


@ -109,7 +109,7 @@ class ParameterIndexOperations(object):
try: try:
return self._properties.itervalues() return self._properties.itervalues()
except AttributeError: except AttributeError:
#Changed this from itervalues to values for Py3 compatibility. It didn't break the test suite. #Changed this from itervalues to values for Py3 compatibility. It didn't break the test suite.
return self._properties.values() return self._properties.values()
def indices(self): def indices(self):


@ -197,9 +197,10 @@ class Parameterized(Parameterizable):
raise RuntimeError("{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param))) raise RuntimeError("{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param)))
start = sum([p.size for p in self.parameters[:param._parent_index_]]) start = sum([p.size for p in self.parameters[:param._parent_index_]])
self._remove_parameter_name(param)
self.size -= param.size self.size -= param.size
del self.parameters[param._parent_index_] del self.parameters[param._parent_index_]
self._remove_parameter_name(param)
param._disconnect_parent() param._disconnect_parent()
param.remove_observer(self, self._pass_through_notify_observers) param.remove_observer(self, self._pass_through_notify_observers)


@ -522,16 +522,9 @@ class DGPLVM(Prior):
""" """
domain = _REAL domain = _REAL
# _instances = []
# def __new__(cls, mu, sigma): # Singleton: def __new__(cls, sigma2, lbl, x_shape):
# if cls._instances: return super(Prior, cls).__new__(cls, sigma2, lbl, x_shape)
# cls._instances[:] = [instance for instance in cls._instances if instance()]
# for instance in cls._instances:
# if instance().mu == mu and instance().sigma == sigma:
# return instance()
# o = super(Prior, cls).__new__(cls, mu, sigma)
# cls._instances.append(weakref.ref(o))
# return cls._instances[-1]()
def __init__(self, sigma2, lbl, x_shape): def __init__(self, sigma2, lbl, x_shape):
self.sigma2 = sigma2 self.sigma2 = sigma2
@ -843,7 +836,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
# Calculating beta and Bi for Sb # Calculating beta and Bi for Sb
def compute_sig_beta_Bi(self, data_idx, M_i, M_0, lst_idx_all): def compute_sig_beta_Bi(self, data_idx, M_i, M_0, lst_idx_all):
import pdb # import pdb
# pdb.set_trace() # pdb.set_trace()
B_i = np.zeros((self.classnum, self.dim)) B_i = np.zeros((self.classnum, self.dim))
Sig_beta_B_i_all = np.zeros((self.datanum, self.dim)) Sig_beta_B_i_all = np.zeros((self.datanum, self.dim))
@ -909,8 +902,8 @@ class DGPLVM_Lamda(Prior, Parameterized):
Sw = self.compute_Sw(cls, M_i) Sw = self.compute_Sw(cls, M_i)
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1)) # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1) #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0] #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0] Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw)) return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
# This function calculates derivative of the log of prior function # This function calculates derivative of the log of prior function
@ -933,8 +926,8 @@ class DGPLVM_Lamda(Prior, Parameterized):
# Calculating inverse of Sb and its transpose and minus # Calculating inverse of Sb and its transpose and minus
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1)) # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1) #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0] #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0] Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
Sb_inv_N_trans = np.transpose(Sb_inv_N) Sb_inv_N_trans = np.transpose(Sb_inv_N)
Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
Sw_trans = np.transpose(Sw) Sw_trans = np.transpose(Sw)


@ -161,6 +161,16 @@ class NormalPosterior(VariationalPosterior):
from ...plotting.matplot_dep import variational_plots from ...plotting.matplot_dep import variational_plots
return variational_plots.plot(self, *args, **kwargs) return variational_plots.plot(self, *args, **kwargs)
def KL(self, other):
"""Compute the KL divergence to another NormalPosterior Object. This only holds, if the two NormalPosterior objects have the same shape, as we do computational tricks for the multivariate normal KL divergence.
"""
return .5*(
np.sum(self.variance/other.variance)
+ ((other.mean-self.mean)**2/other.variance).sum()
- self.num_data * self.input_dim
+ np.sum(np.log(other.variance)) - np.sum(np.log(self.variance))
)
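A quick numeric check (a sketch, not part of the diff) of the expression just added, written out against the per-dimension diagonal-Gaussian KL it is shorthand for; the array names are made up:

    import numpy as np

    N, D = 10, 3
    mq, vq = np.random.randn(N, D), np.random.rand(N, D) + 0.1   # "self" mean/variance
    mp, vp = np.random.randn(N, D), np.random.rand(N, D) + 0.1   # "other" mean/variance

    kl = .5 * (np.sum(vq / vp) + ((mp - mq) ** 2 / vp).sum()
               - N * D + np.sum(np.log(vp)) - np.sum(np.log(vq)))

    # elementwise KL(N(mq, vq) || N(mp, vp)), summed over all N*D entries
    kl_check = np.sum(.5 * (np.log(vp / vq) + (vq + (mq - mp) ** 2) / vp - 1.))
    assert np.allclose(kl, kl_check)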
class SpikeAndSlabPosterior(VariationalPosterior): class SpikeAndSlabPosterior(VariationalPosterior):
''' '''
The SpikeAndSlab distribution for variational approximations. The SpikeAndSlab distribution for variational approximations.


@ -34,7 +34,7 @@ class SparseGP_MPI(SparseGP):
""" """
def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False): def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp', Y_metadata=None, mpi_comm=None, normalizer=False):
self._IN_OPTIMIZATION_ = False self._IN_OPTIMIZATION_ = False
if mpi_comm != None: if mpi_comm != None:
if inference_method is None: if inference_method is None:


@ -5,7 +5,7 @@ import numpy as np
from ..util import choleskies from ..util import choleskies
from .sparse_gp import SparseGP from .sparse_gp import SparseGP
from .parameterization.param import Param from .parameterization.param import Param
from ..inference.latent_function_inference import SVGP as svgp_inf from ..inference.latent_function_inference.svgp import SVGP as svgp_inf
class SVGP(SparseGP): class SVGP(SparseGP):


@ -69,7 +69,7 @@ from .expectation_propagation_dtc import EPDTC
from .dtc import DTC from .dtc import DTC
from .fitc import FITC from .fitc import FITC
from .var_dtc_parallel import VarDTC_minibatch from .var_dtc_parallel import VarDTC_minibatch
from .svgp import SVGP #from .svgp import SVGP
# class FullLatentFunctionData(object): # class FullLatentFunctionData(object):
# #


@ -27,12 +27,19 @@ def infer_newX(model, Y_new, optimize=True, init='L2'):
class InferenceX(Model): class InferenceX(Model):
""" """
The class for inference of new X with given new Y. (do_test_latent) The model class for inference of new X with given new Y. (replacing the "do_test_latent" in Bayesian GPLVM)
It is a tiny inference model created from the original GP model. The kernel, likelihood (only Gaussian is supported at the moment)
and posterior distribution are taken from the original model.
For Regression models and GPLVM, a point estimate of the latent variable X will be inferred.
For Bayesian GPLVM, the variational posterior of X will be inferred.
X is inferred through a gradient optimization of the inference model.
:param model: the GPy model used in inference :param model: the GPy model used in inference
:type model: GPy.core.Model :type model: GPy.core.Model
:param Y: the new observed data for inference :param Y: the new observed data for inference
:type Y: numpy.ndarray :type Y: numpy.ndarray
:param init: the distance metric of Y for initializing X with the nearest neighbour.
:type init: 'L2', 'NCC' and 'rand'
""" """
def __init__(self, model, Y, name='inferenceX', init='L2'): def __init__(self, model, Y, name='inferenceX', init='L2'):
if np.isnan(Y).any() or getattr(model, 'missing_data', False): if np.isnan(Y).any() or getattr(model, 'missing_data', False):
@ -59,7 +66,7 @@ class InferenceX(Model):
from ...models.ss_mrd import IBPPrior_SSMRD from ...models.ss_mrd import IBPPrior_SSMRD
if isinstance(model.variational_prior, IBPPrior) or isinstance(model.variational_prior, IBPPrior_SSMRD): if isinstance(model.variational_prior, IBPPrior) or isinstance(model.variational_prior, IBPPrior_SSMRD):
from ...core.parameterization.variational import SpikeAndSlabPrior from ...core.parameterization.variational import SpikeAndSlabPrior
self.variational_prior = SpikeAndSlabPrior(pi=05,learnPi=False, group_spike=False) self.variational_prior = SpikeAndSlabPrior(pi=0.5, learnPi=False, group_spike=False)
else: else:
self.variational_prior = model.variational_prior.copy() self.variational_prior = model.variational_prior.copy()
else: else:


@ -5,6 +5,10 @@ class StochasticStorage(object):
''' '''
This is a container for holding the stochastic parameters, This is a container for holding the stochastic parameters,
such as subset indices or step length and so on. such as subset indices or step length and so on.
self.d has to be a list of lists:
[dimension indices, nan indices for those dimensions]
so that the minibatches can be used as efficiently as possible.
''' '''
def __init__(self, model): def __init__(self, model):
""" """
@ -28,9 +32,23 @@ class SparseGPMissing(StochasticStorage):
""" """
Here we want to loop over all dimensions every time. Here we want to loop over all dimensions every time.
Thus, we can just make sure the loop goes over self.d every Thus, we can just make sure the loop goes over self.d every
time. time. We try to group together batches which look the same,
which speeds up the calculations significantly.
""" """
self.d = range(model.Y_normalized.shape[1]) import numpy as np
self.Y = model.Y_normalized
bdict = {}
for d in range(self.Y.shape[1]):
inan = np.isnan(self.Y[:, d])
arr_str = np.array2string(inan,
np.inf, 0,
True, '',
formatter={'bool':lambda x: '1' if x else '0'})
try:
bdict[arr_str][0].append(d)
except:
bdict[arr_str] = [[d], ~inan]
self.d = bdict.values()
class SparseGPStochastics(StochasticStorage): class SparseGPStochastics(StochasticStorage):
""" """
@ -40,16 +58,29 @@ class SparseGPStochastics(StochasticStorage):
def __init__(self, model, batchsize=1): def __init__(self, model, batchsize=1):
self.batchsize = batchsize self.batchsize = batchsize
self.output_dim = model.Y.shape[1] self.output_dim = model.Y.shape[1]
self.Y = model.Y_normalized
self.reset() self.reset()
self.do_stochastics() self.do_stochastics()
def do_stochastics(self): def do_stochastics(self):
if self.batchsize == 1: if self.batchsize == 1:
self.current_dim = (self.current_dim+1)%self.output_dim self.current_dim = (self.current_dim+1)%self.output_dim
self.d = [self.current_dim] self.d = [[[self.current_dim], np.isnan(self.Y[:, self.d])]]
else: else:
import numpy as np import numpy as np
self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False) self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False)
bdict = {}
for d in self.d:
inan = np.isnan(self.Y[:, d])
arr_str = int(np.array2string(inan,
np.inf, 0,
True, '',
formatter={'bool':lambda x: '1' if x else '0'}), 2)
try:
bdict[arr_str][0].append(d)
except:
bdict[arr_str] = [[d], ~inan]
self.d = bdict.values()
def reset(self): def reset(self):
self.current_dim = -1 self.current_dim = -1
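Both classes above group output dimensions by their missing-data pattern so that columns with identical NaN masks can be processed in one batch. A small self-contained sketch of that grouping idea, not part of the diff (it uses bytes as the dictionary key instead of array2string, and the variable names are made up):

    import numpy as np

    Y = np.random.randn(6, 4)
    Y[2, 1] = Y[2, 3] = np.nan        # give columns 1 and 3 the same missing pattern

    bdict = {}
    for d in range(Y.shape[1]):
        inan = np.isnan(Y[:, d])
        key = inan.tobytes()          # one key per distinct NaN pattern
        if key in bdict:
            bdict[key][0].append(d)
        else:
            bdict[key] = [[d], ~inan]

    batches = list(bdict.values())    # each entry: [column indices, mask of observed rows]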


@ -6,6 +6,7 @@ from ._src.brownian import Brownian
from ._src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine from ._src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
from ._src.mlp import MLP from ._src.mlp import MLP
from ._src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52 from ._src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52
from ._src.standard_periodic import StdPeriodic
from ._src.independent_outputs import IndependentOutputs, Hierarchical from ._src.independent_outputs import IndependentOutputs, Hierarchical
from ._src.coregionalize import Coregionalize from ._src.coregionalize import Coregionalize
from ._src.ODE_UY import ODE_UY from ._src.ODE_UY import ODE_UY
@ -17,7 +18,7 @@ from ._src.eq_ode2 import EQ_ODE2
from ._src.trunclinear import TruncLinear,TruncLinear_inf from ._src.trunclinear import TruncLinear,TruncLinear_inf
from ._src.splitKern import SplitKern,DEtime from ._src.splitKern import SplitKern,DEtime
from ._src.splitKern import DEtime as DiffGenomeKern from ._src.splitKern import DEtime as DiffGenomeKern
from ._src.spline import Spline
from ._src.eq_ode2 import EQ_ODE2
from ._src.basis_funcs import LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel from ._src.basis_funcs import LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel

File diff suppressed because it is too large.


@ -1,33 +1,37 @@
#cython: boundscheck=True #cython: boundscheck=False
#cython: wraparound=True #cython: wraparound=False
#cython: nonecheck=False
import cython import cython
import numpy as np import numpy as np
cimport numpy as np cimport numpy as np
def K_symmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X): def K_symmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X):
cdef int N = X.size cdef int N = X.size
cdef np.ndarray[np.double_t, ndim=2] K = np.empty((N, N)) cdef np.ndarray[np.double_t, ndim=2, mode='c'] K = np.empty((N, N))
for n in range(N): with nogil:
for m in range(N): for n in range(N):
K[n,m] = B[X[n],X[m]] for m in range(N):
K[n, m] = B[X[n], X[m]]
return K return K
def K_asymmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X, np.ndarray[np.int64_t, ndim=1] X2): def K_asymmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X, np.ndarray[np.int64_t, ndim=1] X2):
cdef int N = X.size cdef int N = X.size
cdef int M = X2.size cdef int M = X2.size
cdef np.ndarray[np.double_t, ndim=2] K = np.empty((N, M)) cdef np.ndarray[np.double_t, ndim=2, mode='c'] K = np.empty((N, M))
for n in range(N): with nogil:
for m in range(M): for n in range(N):
K[n,m] = B[X[n],X2[m]] for m in range(M):
K[n, m] = B[X[n], X2[m]]
return K return K
def gradient_reduce(int D, np.ndarray[double, ndim=2] dL_dK, np.ndarray[np.int64_t, ndim=1] index, np.ndarray[np.int64_t, ndim=1] index2): def gradient_reduce(int D, np.ndarray[double, ndim=2] dL_dK, np.ndarray[np.int64_t, ndim=1] index, np.ndarray[np.int64_t, ndim=1] index2):
cdef np.ndarray[np.double_t, ndim=2] dL_dK_small = np.zeros((D, D)) cdef np.ndarray[np.double_t, ndim=2, mode='c'] dL_dK_small = np.zeros((D, D))
cdef int N = index.size cdef int N = index.size
cdef int M = index2.size cdef int M = index2.size
for i in range(N): with nogil:
for j in range(M): for i in range(N):
dL_dK_small[index2[j],index[i]] += dL_dK[i,j]; for j in range(M):
dL_dK_small[index2[j],index[i]] += dL_dK[i,j];
return dL_dK_small return dL_dK_small
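What the Cython loops above compute can also be written as one line of NumPy fancy indexing, which is handy as a reference when checking the nogil version (a sketch with made-up sizes, not part of the diff):

    import numpy as np

    B = np.random.rand(3, 3)                            # coregionalization matrix
    X = np.random.randint(0, 3, 10).astype(np.int64)    # output indices
    X2 = np.random.randint(0, 3, 7).astype(np.int64)

    K = B[np.ix_(X, X2)]                                # K[n, m] == B[X[n], X2[m]]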


@ -55,16 +55,15 @@ def __psi2computations(variance, lengthscale, Z, mu, S):
# Produced intermediate results: # Produced intermediate results:
# _psi2 MxM # _psi2 MxM
N,M,Q = mu.shape[0], Z.shape[0], mu.shape[1]
lengthscale2 = np.square(lengthscale) lengthscale2 = np.square(lengthscale)
_psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N _psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N
_psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM _psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM
Z_hat = (Z[:,None,:]+Z[None,:,:])/2. #MxMxQ Z_hat = (Z[:,None,:]+Z[None,:,:])/2. #MxMxQ
denom = 1./(2.*S+lengthscale2) denom = 1./(2.*S+lengthscale2)
_psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+2.*np.einsum('nq,moq,nq->nmo',mu,Z_hat,denom)-np.einsum('moq,nq->nmo',np.square(Z_hat),denom) _psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+(2*(mu*denom).dot(Z_hat.reshape(M*M,Q).T) - denom.dot(np.square(Z_hat).reshape(M*M,Q).T)).reshape(N,M,M)
_psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2) _psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2)
return _psi2 return _psi2
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
@ -157,5 +156,5 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
_psi1computations = Cacher(__psi1computations, limit=1) _psi1computations = Cacher(__psi1computations, limit=5)
_psi2computations = Cacher(__psi2computations, limit=1) _psi2computations = Cacher(__psi2computations, limit=5)
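The replaced einsum on the _psi2_exp2 line is algebraically the same as the new dot/reshape form; a small check of that equivalence (a sketch with arbitrary shapes, not part of the diff):

    import numpy as np

    N, M, Q = 5, 4, 3
    mu, denom = np.random.rand(N, Q), np.random.rand(N, Q)
    Z_hat = np.random.rand(M, M, Q)

    old = 2. * np.einsum('nq,moq,nq->nmo', mu, Z_hat, denom) \
          - np.einsum('moq,nq->nmo', np.square(Z_hat), denom)
    new = (2 * (mu * denom).dot(Z_hat.reshape(M * M, Q).T)
           - denom.dot(np.square(Z_hat).reshape(M * M, Q).T)).reshape(N, M, M)
    assert np.allclose(old, new)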

GPy/kern/_src/spline.py (new file, 52 lines)

@ -0,0 +1,52 @@
# Copyright (c) 2015, Thomas Hornung
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from .kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
class Spline(Kern):
"""
Linear spline kernel. You need to specify 2 parameters: the variance and c.
The variance is defined in powers of 10. Thus specifying -2 means 10^-2.
The parameter c controls the stiffness of the spline fit; a very stiff
spline is equivalent to linear regression.
See https://www.youtube.com/watch?v=50Vgw11qn0o starting at minute 1:17:28
Lit: Wahba, 1990
"""
def __init__(self, input_dim, variance=1., c=1., active_dims=None, name='spline'):
super(Spline, self).__init__(input_dim, active_dims, name)
self.variance = Param('variance', variance, Logexp())
self.c = Param('c', c)
self.link_parameters(self.variance,self.c)
def K(self, X, X2=None):
if X2 is None: X2=X
term1 = (X+8.)*(X2.T+8.)/16.
term2 = abs((X-X2.T)/16.)**3
term3 = ((X+8.)/16.)**3 + ((X2.T+8.)/16.)**3
return (self.variance**2 * (1. + (1.+self.c) * term1 + self.c/3. * (term2 - term3)))
def Kdiag(self, X):
term1 = np.square(X+8.)/16.
term3 = 2. * ((X+8.)/16.)**3
return (self.variance**2 * (1. + (1.+self.c) * term1 - self.c/3. * term3))[:,0]
def update_gradients_full(self, dL_dK, X, X2=None):
if X2 is None: X2=X
term1 = (X+8.)*(X2.T+8.)/16.
term2 = abs((X-X2.T)/16.)**3
term3 = ((X+8.)/16.)**3 + ((X2.T+8.)/16.)**3
self.variance.gradient = np.sum(dL_dK * (2*self.variance * (1. + (1.+self.c) * term1 + self.c/3. * ( term2 - term3))))
self.c.gradient = np.sum(dL_dK * (self.variance**2* (term1 + 1./3.*(term2 - term3))))
def update_gradients_diag(self, dL_dKdiag, X):
raise NotImplementedError
def gradients_X(self, dL_dK, X, X2=None):
raise NotImplementedError
def gradients_X_diag(self, dL_dKdiag, X):
raise NotImplementedError
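Since the kern __init__ above now imports Spline, a model could be built along these lines (a sketch, not part of the diff; the data is synthetic and the hyperparameters arbitrary):

    import numpy as np
    import GPy

    X = np.linspace(-5., 5., 40)[:, None]
    Y = np.sin(X) + 0.1 * np.random.randn(*X.shape)

    k = GPy.kern.Spline(input_dim=1, variance=1., c=1.)
    m = GPy.models.GPRegression(X, Y, kernel=k)
    m.optimize()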


@ -0,0 +1,166 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
The standard periodic kernel which mentioned in:
[1] Gaussian Processes for Machine Learning, C. E. Rasmussen, C. K. I. Williams.
The MIT Press, 2005.
[2] Introduction to Gaussian processes. D. J. C. MacKay. In C. M. Bishop, editor,
Neural Networks and Machine Learning, pages 133-165. Springer, 1998.
"""
from .kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
class StdPeriodic(Kern):
"""
Standard periodic kernel
.. math::
k(x,y) = \theta_1 \exp \left[ - \frac{1}{2} \sum_{i=1}^{\mathrm{input\_dim}}
\left( \frac{\sin\left( \frac{\pi}{\lambda_i} (x_i - y_i) \right)}{l_i} \right)^2 \right]
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance :math:`\theta_1` in the formula above
:type variance: float
:param wavelength: the vector of wavelengths :math:`\lambda_i`. If None then 1.0 is assumed.
:type wavelength: array or list of the appropriate size (or float if there is only one wavelength parameter)
:param lengthscale: the vector of lengthscales :math:`l_i`. If None then 1.0 is assumed.
:type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
:param ARD1: Auto Relevance Determination with respect to the wavelengths.
If equal to "False" a single wavelength parameter :math:`\lambda` shared by
all dimensions is assumed, otherwise there is one wavelength
parameter per dimension.
:type ARD1: Boolean
:param ARD2: Auto Relevance Determination with respect to the lengthscales.
If equal to "False" a single lengthscale parameter :math:`l` shared by
all dimensions is assumed, otherwise there is one lengthscale
parameter per dimension.
:type ARD2: Boolean
:param active_dims: indices of dimensions which are used in the computation of the kernel
:type active_dims: array or list of the appropriate size
:param name: name of the kernel for output
:type name: String
:param useGPU: whether or not to use the GPU
:type useGPU: Boolean
"""
def __init__(self, input_dim, variance=1., wavelength=None, lengthscale=None, ARD1=False, ARD2=False, active_dims=None, name='std_periodic',useGPU=False):
super(StdPeriodic, self).__init__(input_dim, active_dims, name, useGPU=useGPU)
self.input_dim = input_dim
self.ARD1 = ARD1 # correspond to wavelengths
self.ARD2 = ARD2 # correspond to lengthscales
self.name = name
if self.ARD1 == False:
if wavelength is not None:
wavelength = np.asarray(wavelength)
assert wavelength.size == 1, "Only one wavelength needed for non-ARD kernel"
else:
wavelength = np.ones(1)
else:
if wavelength is not None:
wavelength = np.asarray(wavelength)
assert wavelength.size == input_dim, "bad number of wavelengths"
else:
wavelength = np.ones(input_dim)
if self.ARD2 == False:
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
else:
lengthscale = np.ones(1)
else:
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == input_dim, "bad number of lengthscales"
else:
lengthscale = np.ones(input_dim)
self.variance = Param('variance', variance, Logexp())
assert self.variance.size==1, "Variance size must be one"
self.wavelengths = Param('wavelengths', wavelength, Logexp())
self.lengthscales = Param('lengthscales', lengthscale, Logexp())
self.link_parameters(self.variance, self.wavelengths, self.lengthscales)
def parameters_changed(self):
"""
This function acts as a callback for each optimization iteration.
If an optimization step was successful and the parameters were updated,
this callback is called so that any precomputations
for the kernel can be updated.
"""
pass
def K(self, X, X2=None):
"""Compute the covariance matrix between X and X2."""
if X2 is None:
X2 = X
base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths
exp_dist = np.exp( -0.5* np.sum( np.square( np.sin( base ) / self.lengthscales ), axis = -1 ) )
return self.variance * exp_dist
def Kdiag(self, X):
"""Compute the diagonal of the covariance matrix associated to X."""
ret = np.empty(X.shape[0])
ret[:] = self.variance
return ret
def update_gradients_full(self, dL_dK, X, X2=None):
"""derivative of the covariance matrix with respect to the parameters."""
if X2 is None:
X2 = X
base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths
sin_base = np.sin( base )
exp_dist = np.exp( -0.5* np.sum( np.square( sin_base / self.lengthscales ), axis = -1 ) )
dwl = self.variance * (1.0/np.square(self.lengthscales)) * sin_base*np.cos(base) * (base / self.wavelengths)
dl = self.variance * np.square( sin_base) / np.power( self.lengthscales, 3)
self.variance.gradient = np.sum(exp_dist * dL_dK)
#target[0] += np.sum( exp_dist * dL_dK)
if self.ARD1: # different wavelengths
self.wavelengths.gradient = (dwl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
else: # same wavelengths
self.wavelengths.gradient = np.sum(dwl.sum(-1) * exp_dist * dL_dK)
if self.ARD2: # different lengthscales
self.lengthscales.gradient = (dl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
else: # same lengthscales
self.lengthscales.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK)
def update_gradients_diag(self, dL_dKdiag, X):
"""derivative of the diagonal of the covariance matrix with respect to the parameters."""
self.variance.gradient = np.sum(dL_dKdiag)
self.wavelengths.gradient = 0
self.lengthscales.gradient = 0
# def gradients_X(self, dL_dK, X, X2=None):
# """derivative of the covariance matrix with respect to X."""
#
# raise NotImplemented("Periodic kernel: dK_dX not implemented")
#
# def gradients_X_diag(self, dL_dKdiag, X):
#
# raise NotImplemented("Periodic kernel: dKdiag_dX not implemented")
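A usage sketch for the new kernel (not part of the diff), assuming it is exposed as GPy.kern.StdPeriodic via the import added to the kern __init__ earlier in this commit; data and hyperparameters are made up:

    import numpy as np
    import GPy

    X = np.linspace(0., 10., 50)[:, None]
    Y = np.sin(2 * np.pi * X / 3.) + 0.1 * np.random.randn(*X.shape)

    k = GPy.kern.StdPeriodic(input_dim=1, variance=1., wavelength=3., lengthscale=1.)
    m = GPy.models.GPRegression(X, Y, kernel=k)
    m.optimize()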

File diff suppressed because it is too large.


@ -4,14 +4,15 @@
import numpy as np import numpy as np
cimport numpy as np cimport numpy as np
from cython.parallel import prange from cython.parallel import prange
cimport cython
ctypedef np.float64_t DTYPE_t ctypedef np.float64_t DTYPE_t
cdef extern from "stationary_utils.h": cdef extern from "stationary_utils.h":
void _grad_X "_grad_X" (int N, int D, int M, double* X, double* X2, double* tmp, double* grad) void _grad_X "_grad_X" (int N, int D, int M, double* X, double* X2, double* tmp, double* grad) nogil
cdef extern from "stationary_utils.h": cdef extern from "stationary_utils.h":
void _lengthscale_grads "_lengthscale_grads" (int N, int M, int Q, double* tmp, double* X, double* X2, double* grad) void _lengthscale_grads "_lengthscale_grads" (int N, int M, int Q, double* tmp, double* X, double* X2, double* grad) nogil
def grad_X(int N, int D, int M, def grad_X(int N, int D, int M,
np.ndarray[DTYPE_t, ndim=2] _X, np.ndarray[DTYPE_t, ndim=2] _X,
@ -22,18 +23,18 @@ def grad_X(int N, int D, int M,
cdef double *X2 = <double*> _X2.data cdef double *X2 = <double*> _X2.data
cdef double *tmp = <double*> _tmp.data cdef double *tmp = <double*> _tmp.data
cdef double *grad = <double*> _grad.data cdef double *grad = <double*> _grad.data
_grad_X(N, D, M, X, X2, tmp, grad) # return nothing, work in place. with nogil:
_grad_X(N, D, M, X, X2, tmp, grad) # return nothing, work in place.
@cython.cdivision(True)
def grad_X_cython(int N, int D, int M, double[:,:] X, double[:,:] X2, double[:,:] tmp, double[:,:] grad): def grad_X_cython(int N, int D, int M, double[:,:] X, double[:,:] X2, double[:,:] tmp, double[:,:] grad):
cdef int n,d,nd,m cdef int n,d,nd,m
for nd in prange(N*D, nogil=True): for nd in prange(N * D, nogil=True):
n = nd/D n = nd / D
d = nd%D d = nd % D
grad[n,d] = 0.0 grad[n,d] = 0.0
for m in range(M): for m in range(M):
grad[n,d] += tmp[n,m]*(X[n,d]-X2[m,d]) grad[n,d] += tmp[n, m] * (X[n, d] - X2[m, d])
def lengthscale_grads_in_c(int N, int M, int Q, def lengthscale_grads_in_c(int N, int M, int Q,
np.ndarray[DTYPE_t, ndim=2] _tmp, np.ndarray[DTYPE_t, ndim=2] _tmp,
@ -44,16 +45,16 @@ def lengthscale_grads_in_c(int N, int M, int Q,
cdef double *X = <double*> _X.data cdef double *X = <double*> _X.data
cdef double *X2 = <double*> _X2.data cdef double *X2 = <double*> _X2.data
cdef double *grad = <double*> _grad.data cdef double *grad = <double*> _grad.data
_lengthscale_grads(N, M, Q, tmp, X, X2, grad) # return nothing, work in place. with nogil:
_lengthscale_grads(N, M, Q, tmp, X, X2, grad) # return nothing, work in place.
def lengthscale_grads(int N, int M, int Q, double[:,:] tmp, double[:,:] X, double[:,:] X2, double[:] grad): def lengthscale_grads(int N, int M, int Q, double[:,:] tmp, double[:,:] X, double[:,:] X2, double[:] grad):
cdef int q, n, m cdef int q, n, m
cdef double gradq, dist cdef double gradq, dist
for q in range(Q): with nogil:
grad[q] = 0.0 for q in range(Q):
for n in range(N): grad[q] = 0.0
for m in range(M): for n in range(N):
dist = X[n,q] - X2[m,q] for m in range(M):
grad[q] += tmp[n,m]*dist*dist dist = X[n,q] - X2[m,q]
grad[q] += tmp[n, m] * dist * dist


@ -232,6 +232,17 @@ class Bernoulli(Likelihood):
np.seterr(**state) np.seterr(**state)
return d3logpdf_dlink3 return d3logpdf_dlink3
def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
"""
Get the "quantiles" of the binary labels (Bernoulli draws). all the
quantiles must be either 0 or 1, since those are the only values the
draw can take!
"""
p = self.predictive_mean(mu, var)
return [np.asarray(p>(q/100.), dtype=np.int32) for q in quantiles]
def samples(self, gp, Y_metadata=None): def samples(self, gp, Y_metadata=None):
""" """
Returns a set of samples of observations based on a given value of the latent variable. Returns a set of samples of observations based on a given value of the latent variable.
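The new predictive_quantiles simply thresholds the predictive mean, so every returned "quantile" is a 0/1 array; a toy illustration (not part of the diff; p stands in for the predictive mean):

    import numpy as np

    p = np.array([0.1, 0.4, 0.8])
    quantiles = (2.5, 50., 97.5)
    [np.asarray(p > (q / 100.), dtype=np.int32) for q in quantiles]
    # -> [array([1, 1, 1]), array([0, 0, 1]), array([0, 0, 0])]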


@ -607,7 +607,7 @@ class Likelihood(Parameterized):
pred_mean = self.predictive_mean(mu, var, Y_metadata=Y_metadata) pred_mean = self.predictive_mean(mu, var, Y_metadata=Y_metadata)
pred_var = self.predictive_variance(mu, var, pred_mean, Y_metadata=Y_metadata) pred_var = self.predictive_variance(mu, var, pred_mean, Y_metadata=Y_metadata)
except NotImplementedError: except NotImplementedError:
print "Finding predictive mean and variance via sampling rather than quadrature" print("Finding predictive mean and variance via sampling rather than quadrature")
Nf_samp = 300 Nf_samp = 300
Ny_samp = 1 Ny_samp = 1
s = np.random.randn(mu.shape[0], Nf_samp)*np.sqrt(var) + mu s = np.random.randn(mu.shape[0], Nf_samp)*np.sqrt(var) + mu


@ -137,7 +137,7 @@ class Poisson(Likelihood):
""" """
return self.gp_link.transf(gp) return self.gp_link.transf(gp)
def samples(self, gp, Y_metadata=None): def samples(self, gp, Y_metadata=None, samples=1):
""" """
Returns a set of samples of observations based on a given value of the latent variable. Returns a set of samples of observations based on a given value of the latent variable.
@ -145,5 +145,5 @@ class Poisson(Likelihood):
""" """
orig_shape = gp.shape orig_shape = gp.shape
gp = gp.flatten() gp = gp.flatten()
Ysim = np.random.poisson(self.gp_link.transf(gp)) Ysim = np.random.poisson(self.gp_link.transf(gp), [samples, gp.size]).T
return Ysim.reshape(orig_shape) return Ysim.reshape(orig_shape+(samples,))
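A shape sketch (not part of the diff) for the new samples argument of Poisson.samples: one Poisson draw per latent value and per requested sample, with the sample axis appended to the original shape; the exp rate link here is an assumption standing in for gp_link.transf:

    import numpy as np

    gp = np.random.randn(4, 1)                  # latent function values
    rate = np.exp(gp)                           # stand-in for self.gp_link.transf(gp)
    samples = 3

    Ysim = np.random.poisson(rate.flatten(), [samples, rate.size]).T
    Ysim.reshape(gp.shape + (samples,)).shape   # -> (4, 1, 3)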


@ -1,11 +1,11 @@
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt). # Copyright (c) 2015 James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
from ..core import GP from ..core import GP
from ..models import GPLVM from . import GPLVM
from ..mappings import * from .. import mappings
class BCGPLVM(GPLVM): class BCGPLVM(GPLVM):
@ -16,33 +16,31 @@ class BCGPLVM(GPLVM):
:type Y: np.ndarray :type Y: np.ndarray
:param input_dim: latent dimensionality :param input_dim: latent dimensionality
:type input_dim: int :type input_dim: int
:param init: initialisation method for the latent space
:type init: 'PCA'|'random'
:param mapping: mapping for back constraint :param mapping: mapping for back constraint
:type mapping: GPy.core.Mapping object :type mapping: GPy.core.Mapping object
""" """
def __init__(self, Y, input_dim, init='PCA', X=None, kernel=None, normalize_Y=False, mapping=None): def __init__(self, Y, input_dim, kernel=None, mapping=None):
if mapping is None: if mapping is None:
mapping = Kernel(X=Y, output_dim=input_dim) mapping = mappings.MLP(input_dim=Y.shape[1],
output_dim=input_dim,
hidden_dim=10)
else:
assert mapping.input_dim==Y.shape[1], "mapping input dim does not work for Y dimension"
assert mapping.output_dim==input_dim, "mapping output dim does not work for self.input_dim"
GPLVM.__init__(self, Y, input_dim, X=mapping.f(Y), kernel=kernel, name="bcgplvm")
self.unlink_parameter(self.X)
self.mapping = mapping self.mapping = mapping
GPLVM.__init__(self, Y, input_dim, init, X, kernel, normalize_Y) self.link_parameter(self.mapping)
self.X = self.mapping.f(self.likelihood.Y)
def _get_param_names(self): self.X = self.mapping.f(self.Y)
return self.mapping._get_param_names() + GP._get_param_names(self)
def _get_params(self): def parameters_changed(self):
return np.hstack((self.mapping._get_params(), GP._get_params(self))) self.X = self.mapping.f(self.Y)
GP.parameters_changed(self)
Xgradient = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X, None)
self.mapping.update_gradients(Xgradient, self.Y)
def _set_params(self, x):
self.mapping._set_params(x[:self.mapping.num_params])
self.X = self.mapping.f(self.likelihood.Y)
GP._set_params(self, x[self.mapping.num_params:])
def _log_likelihood_gradients(self):
dL_df = self.kern.gradients_X(self.dL_dK, self.X)
dL_dtheta = self.mapping.df_dtheta(dL_df, self.likelihood.Y)
return np.hstack((dL_dtheta.flatten(), GP._log_likelihood_gradients(self)))
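With the rewrite above, the back-constrained GPLVM defaults to an MLP mapping from Y to X; a minimal construction sketch matching the new __init__ (not part of the diff; synthetic data, and assuming BCGPLVM is exposed under GPy.models as in the test added further down):

    import numpy as np
    import GPy

    Y = np.random.randn(50, 5)
    m = GPy.models.BCGPLVM(Y, input_dim=2)     # default mapping: MLP(Y.shape[1] -> 2, hidden_dim=10)
    m.optimize(messages=False)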


@ -58,12 +58,15 @@ class GPLVM(GP):
return target return target
def plot(self): def plot(self):
assert self.likelihood.Y.shape[1] == 2 assert self.Y.shape[1] == 2, "too high dimensional to plot. Try plot_latent"
pb.scatter(self.likelihood.Y[:, 0], self.likelihood.Y[:, 1], 40, self.X[:, 0].copy(), linewidth=0, cmap=pb.cm.jet) # @UndefinedVariable from matplotlib import pyplot as plt
plt.scatter(self.Y[:, 0],
self.Y[:, 1],
40, self.X[:, 0].copy(),
linewidth=0, cmap=plt.cm.jet)
Xnew = np.linspace(self.X.min(), self.X.max(), 200)[:, None] Xnew = np.linspace(self.X.min(), self.X.max(), 200)[:, None]
mu, _ = self.predict(Xnew) mu, _ = self.predict(Xnew)
import pylab as pb plt.plot(mu[:, 0], mu[:, 1], 'k', linewidth=1.5)
pb.plot(mu[:, 0], mu[:, 1], 'k', linewidth=1.5)
def plot_latent(self, labels=None, which_indices=None, def plot_latent(self, labels=None, which_indices=None,
resolution=50, ax=None, marker='o', s=40, resolution=50, ax=None, marker='o', s=40,


@ -63,33 +63,18 @@ class SparseGPMiniBatch(SparseGP):
if stochastic and missing_data: if stochastic and missing_data:
self.missing_data = True self.missing_data = True
self.ninan = ~np.isnan(Y)
self.stochastics = SparseGPStochastics(self, batchsize) self.stochastics = SparseGPStochastics(self, batchsize)
elif stochastic and not missing_data: elif stochastic and not missing_data:
self.missing_data = False self.missing_data = False
self.stochastics = SparseGPStochastics(self, batchsize) self.stochastics = SparseGPStochastics(self, batchsize)
elif missing_data: elif missing_data:
self.missing_data = True self.missing_data = True
self.ninan = ~np.isnan(Y)
self.stochastics = SparseGPMissing(self) self.stochastics = SparseGPMissing(self)
else: else:
self.stochastics = False self.stochastics = False
logger.info("Adding Z as parameter") logger.info("Adding Z as parameter")
self.link_parameter(self.Z, index=0) self.link_parameter(self.Z, index=0)
if self.missing_data:
self.Ylist = []
overall = self.Y_normalized.shape[1]
m_f = lambda i: "Precomputing Y for missing data: {: >7.2%}".format(float(i+1)/overall)
message = m_f(-1)
print(message, end=' ')
for d in range(overall):
self.Ylist.append(self.Y_normalized[self.ninan[:, d], d][:, None])
print(' '*(len(message)+1) + '\r', end=' ')
message = m_f(d)
print(message, end=' ')
print('')
self.posterior = None self.posterior = None
def has_uncertain_inputs(self): def has_uncertain_inputs(self):
@ -245,8 +230,7 @@ class SparseGPMiniBatch(SparseGP):
message = m_f(-1) message = m_f(-1)
print(message, end=' ') print(message, end=' ')
for d in self.stochastics.d: for d, ninan in self.stochastics.d:
ninan = self.ninan[:, d]
if not self.stochastics: if not self.stochastics:
print(' '*(len(message)) + '\r', end=' ') print(' '*(len(message)) + '\r', end=' ')
@ -257,7 +241,7 @@ class SparseGPMiniBatch(SparseGP):
grad_dict, current_values, value_indices = self._inner_parameters_changed( grad_dict, current_values, value_indices = self._inner_parameters_changed(
self.kern, self.X[ninan], self.kern, self.X[ninan],
self.Z, self.likelihood, self.Z, self.likelihood,
self.Ylist[d], self.Y_metadata, self.Y_normalized[ninan][:, d], self.Y_metadata,
Lm, dL_dKmm, Lm, dL_dKmm,
subset_indices=dict(outputs=d, samples=ninan)) subset_indices=dict(outputs=d, samples=ninan))
@ -266,8 +250,8 @@ class SparseGPMiniBatch(SparseGP):
Lm = posterior.K_chol Lm = posterior.K_chol
dL_dKmm = grad_dict['dL_dKmm'] dL_dKmm = grad_dict['dL_dKmm']
woodbury_inv[:, :, d] = posterior.woodbury_inv woodbury_inv[:, :, d] = posterior.woodbury_inv[:,:,None]
woodbury_vector[:, d:d+1] = posterior.woodbury_vector woodbury_vector[:, d] = posterior.woodbury_vector
self._log_marginal_likelihood += log_marginal_likelihood self._log_marginal_likelihood += log_marginal_likelihood
if not self.stochastics: if not self.stochastics:
print('') print('')


@ -3,7 +3,7 @@
try: try:
import Tango #import Tango
import pylab as pb import pylab as pb
except: except:
pass pass
@ -17,11 +17,11 @@ def ax_default(fignum, ax):
fig = ax.figure fig = ax.figure
return fig, ax return fig, ax
def meanplot(x, mu, color=Tango.colorsHex['darkBlue'], ax=None, fignum=None, linewidth=2,**kw): def meanplot(x, mu, color='#3300FF', ax=None, fignum=None, linewidth=2,**kw):
_, axes = ax_default(fignum, ax) _, axes = ax_default(fignum, ax)
return axes.plot(x,mu,color=color,linewidth=linewidth,**kw) return axes.plot(x,mu,color=color,linewidth=linewidth,**kw)
def gpplot(x, mu, lower, upper, edgecol=Tango.colorsHex['darkBlue'], fillcol=Tango.colorsHex['lightBlue'], ax=None, fignum=None, **kwargs): def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, fignum=None, **kwargs):
_, axes = ax_default(fignum, ax) _, axes = ax_default(fignum, ax)
mu = mu.flatten() mu = mu.flatten()


@ -1,17 +1,14 @@
# Copyright (c) 2012-2015, GPy authors (see AUTHORS.txt). # Copyright (c) 2012-2015, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
try:
import Tango
import pylab as pb
except:
pass
import numpy as np import numpy as np
from . import Tango
from base_plots import gpplot, x_frame1D, x_frame2D from base_plots import gpplot, x_frame1D, x_frame2D
from ...models.gp_coregionalized_regression import GPCoregionalizedRegression from ...models.gp_coregionalized_regression import GPCoregionalizedRegression
from ...models.sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression from ...models.sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression
from scipy import sparse from scipy import sparse
from ...core.parameterization.variational import VariationalPosterior from ...core.parameterization.variational import VariationalPosterior
from matplotlib import pyplot as plt
def plot_fit(model, plot_limits=None, which_data_rows='all', def plot_fit(model, plot_limits=None, which_data_rows='all',
which_data_ycols='all', fixed_inputs=[], which_data_ycols='all', fixed_inputs=[],
@ -64,7 +61,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
#if len(which_data_ycols)==0: #if len(which_data_ycols)==0:
#raise ValueError('No data selected for plotting') #raise ValueError('No data selected for plotting')
if ax is None: if ax is None:
fig = pb.figure(num=fignum) fig = plt.figure(num=fignum)
ax = fig.add_subplot(111) ax = fig.add_subplot(111)
if hasattr(model, 'has_uncertain_inputs') and model.has_uncertain_inputs(): if hasattr(model, 'has_uncertain_inputs') and model.has_uncertain_inputs():
@ -126,7 +123,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
print Ysim.shape print Ysim.shape
print Xnew.shape print Xnew.shape
for yi in Ysim.T: for yi in Ysim.T:
plots['posterior_samples'] = ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25) plots['posterior_samples'] = ax.plot(Xnew, yi[:,None], '#3300FF', linewidth=0.25)
#ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs. #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
if samples_f: #NOTE not tested with fixed_inputs if samples_f: #NOTE not tested with fixed_inputs
@ -197,8 +194,8 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw) m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw)
for d in which_data_ycols: for d in which_data_ycols:
m_d = m[:,d].reshape(resolution, resolution).T m_d = m[:,d].reshape(resolution, resolution).T
plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=plt.cm.jet)
if not plot_raw: plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) if not plot_raw: plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=plt.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
#set the limits of the plot to some sensible values #set the limits of the plot to some sensible values
ax.set_xlim(xmin[0], xmax[0]) ax.set_xlim(xmin[0], xmax[0])


@ -22,7 +22,7 @@ def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
lines = [] lines = []
fills = [] fills = []
bg_lines = [] bg_lines = []
means, variances = parameterized.mean, parameterized.variance means, variances = parameterized.mean.values, parameterized.variance.values
x = np.arange(means.shape[0]) x = np.arange(means.shape[0])
for i in range(means.shape[1]): for i in range(means.shape[1]):
if ax is None: if ax is None:
@ -43,7 +43,7 @@ def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
if i < means.shape[1] - 1: if i < means.shape[1] - 1:
a.set_xticklabels('') a.set_xticklabels('')
pb.draw() pb.draw()
fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95)) a.figure.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
return dict(lines=lines, fills=fills, bg_lines=bg_lines) return dict(lines=lines, fills=fills, bg_lines=bg_lines)
def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None, side_by_side=True): def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None, side_by_side=True):


@ -51,11 +51,16 @@ class test_stationary(np.testing.TestCase):
class test_choleskies_backprop(np.testing.TestCase): class test_choleskies_backprop(np.testing.TestCase):
def setUp(self): def setUp(self):
self.dL, self.L = np.random.randn(2, 100, 100) a =np.random.randn(10,12)
A = a.dot(a.T)
self.L = GPy.util.linalg.jitchol(A)
self.dL = np.random.randn(10,10)
def test(self): def test(self):
r1 = GPy.util.choleskies._backprop_gradient_pure(self.dL, self.L) r1 = GPy.util.choleskies._backprop_gradient_pure(self.dL, self.L)
r2 = GPy.util.choleskies.choleskies_cython.backprop_gradient(self.dL, self.L) r2 = GPy.util.choleskies.choleskies_cython.backprop_gradient(self.dL, self.L)
r3 = GPy.util.choleskies.choleskies_cython.backprop_gradient_par_c(self.dL, self.L)
np.testing.assert_allclose(r1, r2) np.testing.assert_allclose(r1, r2)
np.testing.assert_allclose(r1, r3)


@ -13,6 +13,7 @@ import GPy
class InferenceXTestCase(unittest.TestCase): class InferenceXTestCase(unittest.TestCase):
def genData(self): def genData(self):
np.random.seed(1)
D1,D2,N = 12,12,50 D1,D2,N = 12,12,50
x = np.linspace(0, 4 * np.pi, N)[:, None] x = np.linspace(0, 4 * np.pi, N)[:, None]


@ -312,7 +312,12 @@ class KernelGradientTestsContinuous(unittest.TestCase):
k = GPy.kern.LinearFull(self.D, self.D-1) k = GPy.kern.LinearFull(self.D, self.D-1)
k.randomize() k.randomize()
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
def test_standard_periodic(self):
k = GPy.kern.StdPeriodic(self.D, self.D-1)
k.randomize()
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
class KernelTestsMiscellaneous(unittest.TestCase): class KernelTestsMiscellaneous(unittest.TestCase):
def setUp(self): def setUp(self):
N, D = 100, 10 N, D = 100, 10


@ -7,6 +7,7 @@ _lim_val_exp = np.log(_lim_val)
_lim_val_square = np.sqrt(_lim_val) _lim_val_square = np.sqrt(_lim_val)
_lim_val_cube = cbrt(_lim_val) _lim_val_cube = cbrt(_lim_val)
from GPy.likelihoods.link_functions import Identity, Probit, Cloglog, Log, Log_ex_1, Reciprocal, Heaviside from GPy.likelihoods.link_functions import Identity, Probit, Cloglog, Log, Log_ex_1, Reciprocal, Heaviside
#np.seterr(over='raise')
class LinkFunctionTests(np.testing.TestCase): class LinkFunctionTests(np.testing.TestCase):
def setUp(self): def setUp(self):


@ -1,6 +1,7 @@
import numpy as np import numpy as np
import scipy as sp import scipy as sp
import GPy import GPy
import warnings
class MiscTests(np.testing.TestCase): class MiscTests(np.testing.TestCase):
""" """
@ -11,8 +12,12 @@ class MiscTests(np.testing.TestCase):
self._lim_val_exp = np.log(self._lim_val) self._lim_val_exp = np.log(self._lim_val)
def test_safe_exp_upper(self): def test_safe_exp_upper(self):
assert np.exp(self._lim_val_exp + 1) == np.inf with warnings.catch_warnings(record=True) as w:
assert GPy.util.misc.safe_exp(self._lim_val_exp + 1) < np.inf assert np.isfinite(np.exp(self._lim_val_exp))
assert np.isinf(np.exp(self._lim_val_exp + 1))
assert np.isfinite(GPy.util.misc.safe_exp(self._lim_val_exp + 1))
assert len(w)==1 # should have one overflow warning
def test_safe_exp_lower(self): def test_safe_exp_lower(self):
assert GPy.util.misc.safe_exp(1e-10) < np.inf assert GPy.util.misc.safe_exp(1e-10) < np.inf
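The updated test expects safe_exp to stay finite where np.exp overflows; conceptually that is just exponent clipping. A hypothetical re-implementation of the idea (the real function lives in GPy.util.misc.safe_exp and may differ):

    import numpy as np

    _lim_val = np.finfo(np.float64).max
    _lim_val_exp = np.log(_lim_val)

    def safe_exp_sketch(x):
        # clip the exponent so the result never overflows to inf
        return np.exp(np.clip(x, -np.inf, _lim_val_exp))

    assert np.isfinite(safe_exp_sketch(_lim_val_exp + 1))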


@ -361,14 +361,12 @@ class GradientTests(np.testing.TestCase):
rbflin = GPy.kern.RBF(2) + GPy.kern.Linear(2) rbflin = GPy.kern.RBF(2) + GPy.kern.Linear(2)
self.check_model(rbflin, model_type='SparseGPRegression', dimension=2) self.check_model(rbflin, model_type='SparseGPRegression', dimension=2)
# @unittest.expectedFailure
def test_SparseGPRegression_rbf_linear_white_kern_2D_uncertain_inputs(self): def test_SparseGPRegression_rbf_linear_white_kern_2D_uncertain_inputs(self):
''' Testing the sparse GP regression with rbf, linear kernel on 2d data with uncertain inputs''' ''' Testing the sparse GP regression with rbf, linear kernel on 2d data with uncertain inputs'''
rbflin = GPy.kern.RBF(2) + GPy.kern.Linear(2) rbflin = GPy.kern.RBF(2) + GPy.kern.Linear(2)
raise unittest.SkipTest("This is not implemented yet!") raise unittest.SkipTest("This is not implemented yet!")
self.check_model(rbflin, model_type='SparseGPRegression', dimension=2, uncertain_inputs=1) self.check_model(rbflin, model_type='SparseGPRegression', dimension=2, uncertain_inputs=1)
# @unittest.expectedFailure
def test_SparseGPRegression_rbf_linear_white_kern_1D_uncertain_inputs(self): def test_SparseGPRegression_rbf_linear_white_kern_1D_uncertain_inputs(self):
''' Testing the sparse GP regression with rbf, linear kernel on 1d data with uncertain inputs''' ''' Testing the sparse GP regression with rbf, linear kernel on 1d data with uncertain inputs'''
rbflin = GPy.kern.RBF(1) + GPy.kern.Linear(1) rbflin = GPy.kern.RBF(1) + GPy.kern.Linear(1)
@ -385,6 +383,16 @@ class GradientTests(np.testing.TestCase):
m = GPy.models.GPLVM(Y, input_dim, kernel=k) m = GPy.models.GPLVM(Y, input_dim, kernel=k)
self.assertTrue(m.checkgrad()) self.assertTrue(m.checkgrad())
def test_BCGPLVM_rbf_bias_white_kern_2D(self):
""" Testing GPLVM with rbf + bias kernel """
N, input_dim, D = 50, 1, 2
X = np.random.rand(N, input_dim)
k = GPy.kern.RBF(input_dim, 0.5, 0.9 * np.ones((1,))) + GPy.kern.Bias(input_dim, 0.1) + GPy.kern.White(input_dim, 0.05)
K = k.K(X)
Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
m = GPy.models.BCGPLVM(Y, input_dim, kernel=k)
self.assertTrue(m.checkgrad())
def test_GPLVM_rbf_linear_white_kern_2D(self): def test_GPLVM_rbf_linear_white_kern_2D(self):
""" Testing GPLVM with rbf + bias kernel """ """ Testing GPLVM with rbf + bias kernel """
N, input_dim, D = 50, 1, 2 N, input_dim, D = 50, 1, 2
@ -410,23 +418,8 @@ class GradientTests(np.testing.TestCase):
Z = np.linspace(0, 15, 4)[:, None] Z = np.linspace(0, 15, 4)[:, None]
kernel = GPy.kern.RBF(1) kernel = GPy.kern.RBF(1)
m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, Z=Z) m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, Z=Z)
# distribution = GPy.likelihoods.likelihood_functions.Bernoulli()
# likelihood = GPy.likelihoods.EP(Y, distribution)
# m = GPy.core.SparseGP(X, likelihood, kernel, Z)
# m.ensure_default_constraints()
self.assertTrue(m.checkgrad()) self.assertTrue(m.checkgrad())
@unittest.expectedFailure
def test_generalized_FITC(self):
N = 20
X = np.hstack([np.random.rand(N / 2) + 1, np.random.rand(N / 2) - 1])[:, None]
k = GPy.kern.RBF(1) + GPy.kern.White(1)
Y = np.hstack([np.ones(N / 2), np.zeros(N / 2)])[:, None]
m = GPy.models.FITCClassification(X, Y, kernel=k)
m.update_likelihood_approximation()
self.assertTrue(m.checkgrad())
@unittest.expectedFailure
def test_multioutput_regression_1D(self): def test_multioutput_regression_1D(self):
X1 = np.random.rand(50, 1) * 8 X1 = np.random.rand(50, 1) * 8
X2 = np.random.rand(30, 1) * 5 X2 = np.random.rand(30, 1) * 5
@ -436,12 +429,11 @@ class GradientTests(np.testing.TestCase):
Y = np.vstack((Y1, Y2)) Y = np.vstack((Y1, Y2))
k1 = GPy.kern.RBF(1) k1 = GPy.kern.RBF(1)
m = GPy.models.GPMultioutputRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel_list=[k1]) m = GPy.models.GPCoregionalizedRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel=k1)
import ipdb;ipdb.set_trace() #import ipdb;ipdb.set_trace()
m.constrain_fixed('.*rbf_var', 1.) #m.constrain_fixed('.*rbf_var', 1.)
self.assertTrue(m.checkgrad()) self.assertTrue(m.checkgrad())
@unittest.expectedFailure
def test_multioutput_sparse_regression_1D(self): def test_multioutput_sparse_regression_1D(self):
X1 = np.random.rand(500, 1) * 8 X1 = np.random.rand(500, 1) * 8
X2 = np.random.rand(300, 1) * 5 X2 = np.random.rand(300, 1) * 5
@ -451,8 +443,7 @@ class GradientTests(np.testing.TestCase):
Y = np.vstack((Y1, Y2)) Y = np.vstack((Y1, Y2))
k1 = GPy.kern.RBF(1) k1 = GPy.kern.RBF(1)
m = GPy.models.SparseGPMultioutputRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel_list=[k1]) m = GPy.models.SparseGPCoregionalizedRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel=k1)
m.constrain_fixed('.*rbf_var', 1.)
self.assertTrue(m.checkgrad()) self.assertTrue(m.checkgrad())
def test_gp_heteroscedastic_regression(self): def test_gp_heteroscedastic_regression(self):

File diff suppressed because it is too large.


@ -7,8 +7,9 @@
import numpy as np import numpy as np
from cython.parallel import prange, parallel from cython.parallel import prange, parallel
cimport numpy as np cimport numpy as np
cimport scipy.linalg.cython_blas as cblas
def flat_to_triang(np.ndarray[double, ndim=2] flat, int M): def flat_to_triang(double[:, :] flat, int M):
"""take a matrix N x D and return a D X M x M array where """take a matrix N x D and return a D X M x M array where
N = M(M+1)/2 N = M(M+1)/2
@ -18,90 +19,96 @@ def flat_to_triang(np.ndarray[double, ndim=2] flat, int M):
cdef int D = flat.shape[1] cdef int D = flat.shape[1]
cdef int N = flat.shape[0] cdef int N = flat.shape[0]
cdef int count = 0 cdef int count = 0
cdef np.ndarray[double, ndim=3] ret = np.zeros((D, M, M)) cdef double[:, :, ::1] ret = np.zeros((D, M, M))
cdef int d, m, mm cdef int d, m, mm
for d in range(D): with nogil:
count = 0 for d in range(D):
for m in range(M): count = 0
for mm in range(m+1): for m in range(M):
ret[d, m, mm] = flat[count,d] for mm in range(m+1):
count += 1 ret[d, m, mm] = flat[count,d]
count += 1
return ret return ret
def triang_to_flat(np.ndarray[double, ndim=3] L): def triang_to_flat(double[:, :, :] L):
cdef int D = L.shape[0] cdef int D = L.shape[0]
cdef int M = L.shape[1] cdef int M = L.shape[1]
cdef int N = M*(M+1)/2 cdef int N = M*(M+1)/2
cdef int count = 0 cdef int count = 0
cdef np.ndarray[double, ndim=2] flat = np.empty((N, D)) cdef double[:, ::1] flat = np.empty((N, D))
cdef int d, m, mm cdef int d, m, mm
for d in range(D): with nogil:
count = 0 for d in range(D):
for m in range(M): count = 0
for mm in range(m+1): for m in range(M):
flat[count,d] = L[d, m, mm] for mm in range(m+1):
count += 1 flat[count,d] = L[d, m, mm]
count += 1
return flat return flat
def backprop_gradient(double[:, :] dL, double[:, :] L):
def backprop_gradient(np.ndarray[double, ndim=2] dL, np.ndarray[double, ndim=2] L): cdef double[:, ::1] dL_dK = np.tril(dL)
cdef np.ndarray[double, ndim=2] dL_dK = np.tril(dL).copy()
cdef int N = L.shape[0] cdef int N = L.shape[0]
cdef int k, j, i cdef int k, j, i
for k in range(N - 1, -1, -1): with nogil:
for j in range(k + 1, N): for k in range(N - 1, -1, -1):
for i in range(j, N): for j in range(k + 1, N):
dL_dK[i, k] -= dL_dK[i, j] * L[j, k] for i in range(j, N):
dL_dK[j, k] -= dL_dK[i, j] * L[i, k] dL_dK[i, k] -= dL_dK[i, j] * L[j, k]
for j in range(k + 1, N): dL_dK[j, k] -= dL_dK[i, j] * L[i, k]
dL_dK[j, k] /= L[k, k] for j in range(k + 1, N):
dL_dK[k, k] -= L[j, k] * dL_dK[j, k] dL_dK[j, k] /= L[k, k]
dL_dK[k, k] /= (2. * L[k, k]) dL_dK[k, k] -= L[j, k] * dL_dK[j, k]
dL_dK[k, k] /= (2. * L[k, k])
return dL_dK return dL_dK
def backprop_gradient_par(double[:,:] dL, double[:,:] L): def backprop_gradient_par(double[:,:] dL, double[:,:] L):
cdef double[:,:] dL_dK = np.tril(dL).copy() cdef double[:,::1] dL_dK = np.tril(dL)
cdef int N = L.shape[0] cdef int N = L.shape[0]
cdef int k, j, i cdef int k, j, i
for k in range(N - 1, -1, -1): with nogil:
with nogil, parallel(): for k in range(N - 1, -1, -1):
for i in prange(k + 1, N): with parallel():
for j in range(k+1, i+1): for i in prange(k + 1, N):
dL_dK[i, k] -= dL_dK[i, j] * L[j, k] for j in range(k+1, i+1):
for j in range(i, N): dL_dK[i, k] -= dL_dK[i, j] * L[j, k]
dL_dK[i, k] -= dL_dK[j, i] * L[j, k] for j in range(i, N):
for j in range(k + 1, N): dL_dK[i, k] -= dL_dK[j, i] * L[j, k]
dL_dK[j, k] /= L[k, k] for j in range(k + 1, N):
dL_dK[k, k] -= L[j, k] * dL_dK[j, k] dL_dK[j, k] /= L[k, k]
dL_dK[k, k] /= (2. * L[k, k]) dL_dK[k, k] -= L[j, k] * dL_dK[j, k]
dL_dK[k, k] /= (2. * L[k, k])
return dL_dK return dL_dK
#here's a pure C version... cdef void chol_backprop(int N, double[:, ::1] dL, double[:, ::1] L) nogil:
cdef extern from "cholesky_backprop.h" nogil: cdef int i, k, n
void chol_backprop(int N, double* dL, double* L)
def backprop_gradient_par_c(np.ndarray[double, ndim=2] dL, np.ndarray[double, ndim=2] L): # DSYMV required constant arguments
cdef np.ndarray[double, ndim=2] dL_dK = np.tril(dL) # makes a copy, c-contig cdef double alpha=-1, beta=1
cdef int incx=N
# DSCAL required arguments
cdef double scale
dL[N - 1, N - 1] /= (2. * L[N - 1, N - 1])
for k in range(N-2, -1, -1):
n = N-k-1
cblas.dsymv(uplo='u', n=&n, alpha=&alpha, a=&dL[k + 1, k + 1], lda=&N, x=&L[k + 1, k], incx=&incx,
beta=&beta, y=&dL[k + 1, k], incy=&N)
for i in xrange(0, N - k - 1):
dL[k + 1 + i, k] -= dL[k + i+ 1, k + i + 1] * L[k + 1 + i, k]
scale = 1.0 / L[k, k]
cblas.dscal(&n, &scale , &dL[k + 1, k], &N)
#
dL[k, k] -= cblas.ddot(&n, &dL[k + 1, k], &N, &L[k+1, k], &incx)
dL[k, k] /= (2.0 * L[k, k])
def backprop_gradient_par_c(double[:, :] dL, double[:, :] L):
cdef double[:, ::1] dL_dK = np.tril(dL) # makes a copy, c-contig
cdef double[:, ::1] L_cont = np.ascontiguousarray(L)
cdef int N = L.shape[0] cdef int N = L.shape[0]
with nogil: with nogil:
chol_backprop(N, <double*> dL_dK.data, <double*> L.data) chol_backprop(N, dL_dK, L_cont)
return dL_dK return np.asarray(dL_dK)
cdef extern from "cholesky_backprop.h" nogil:
void old_chol_backprop(int N, double* dL, double* L)
def backprop_gradient_par_c_old(np.ndarray[double, ndim=2] dL, np.ndarray[double, ndim=2] L):
cdef np.ndarray[double, ndim=2] dL_dK = np.tril(dL) # makes a copy, c-contig
cdef int N = L.shape[0]
with nogil:
old_chol_backprop(N, <double*> dL_dK.data, <double*> L.data)
return dL_dK
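For reference when checking the Cython and BLAS variants above, the same backward pass can be written as a plain NumPy transcription of the triple loop in backprop_gradient (a sketch, not part of the diff; it mirrors the pure-Python fallback the tests compare against):

    import numpy as np

    def backprop_gradient_pure(dL, L):
        # propagate df/dL back to df/dK for K = L L^T
        dL_dK = np.tril(dL).copy()
        N = L.shape[0]
        for k in range(N - 1, -1, -1):
            for j in range(k + 1, N):
                for i in range(j, N):
                    dL_dK[i, k] -= dL_dK[i, j] * L[j, k]
                    dL_dK[j, k] -= dL_dK[i, j] * L[i, k]
            for j in range(k + 1, N):
                dL_dK[j, k] /= L[k, k]
                dL_dK[k, k] -= L[j, k] * dL_dK[j, k]
            dL_dK[k, k] /= (2. * L[k, k])
        return dL_dK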


@ -1,51 +0,0 @@
#include <cblas.h>
void chol_backprop(int N, double* dL, double* L){
//at the input to this fn, dL is df_dL. after this fn is complet, dL is df_dK
int i,k;
dL[N*N - 1] /= (2. * L[N*N - 1]);
for(k=N-2;k>(-1);k--){
cblas_dsymv(CblasRowMajor, CblasLower,
N-k-1, -1,
&dL[(N*(k+1) + k+1)],N,
&L[k*N+k+1],1,
1, &dL[N*(k+1)+k], N);
for(i=0;i<(N-k-1); i++){
dL[N*(k+1+i)+k] -= dL[N*(k+1)+k+i*(N+1)+1] * L[k*N+k+1+i];
}
cblas_dscal(N-k-1, 1.0/L[k*N+k], &dL[(k+1)*N+k], N);
dL[k*N + k] -= cblas_ddot(N-k-1, &dL[(k+1)*N+k], N, &L[k*N+k+1], 1);
dL[k*N + k] /= (2.0 * L[k*N + k]);
}
}
double mydot(int n, double* a, int stride_a, double* b, int stride_b){
double ret = 0;
for(int i=0; i<n; i++){
ret += a[i*stride_a]*b[i*stride_b];
}
return ret;
}
void old_chol_backprop(int N, double* dL, double* U){
//at the input to this fn, dL is df_dL. after this fn is complet, dL is df_dK
int iN, kN,i,j,k;
dL[N*N-1] /= (2. * U[N*N-1]);
for(k=N-2;k>(-1);k--){
kN = k*N;
#pragma omp parallel for private(i,iN)
for(i=k+1; i<N; i++){
iN = i*N;
dL[iN+k] -= mydot(i-k, &dL[iN+k+1], 1, &U[kN+k+1], 1);
dL[iN+k] -= mydot(N-i, &dL[iN+i], N, &U[kN+i], 1);
}
for(i=(k + 1); i<N; i++){
iN = i*N;
dL[iN + k] /= U[kN + k];
dL[kN + k] -= U[kN + i] * dL[iN + k];
}
dL[kN + k] /= (2. * U[kN + k]);
}
}


@ -1,5 +0,0 @@
#include <cblas.h>
void dsymv(int N, double*A, double*b, double*y);
double mydot(int n, double* a, int stride_a, double* b, int stride_b);
void chol_backprop(int N, double* dL, double* L);

File diff suppressed because it is too large.


@ -1,3 +1,4 @@
from libc.math cimport sqrt
cimport numpy as np cimport numpy as np
from cpython cimport bool from cpython cimport bool
import cython import cython
@ -19,16 +20,18 @@ def symmetrify(np.ndarray[double, ndim=2] A, bool upper):
@cython.boundscheck(False) @cython.boundscheck(False)
@cython.wraparound(False) @cython.wraparound(False)
@cython.nonecheck(False) @cython.nonecheck(False)
@cython.cdivision(True)
def cholupdate(np.ndarray[double, ndim=1] x, np.ndarray[double, ndim=2] L, int N): def cholupdate(np.ndarray[double, ndim=1] x, np.ndarray[double, ndim=2] L, int N):
cdef double r cdef double r, c, s
cdef double c cdef int j, i
cdef double s
for j in xrange(N): with nogil:
r = np.sqrt(L[j,j]*L[j,j] + x[j]*x[j]) for j in xrange(N):
c = r / L[j,j] r = sqrt(L[j, j] * L[j, j] + x[j] * x[j])
s = x[j] / L[j,j] c = r / L[j, j]
L[j,j] = r s = x[j] / L[j, j]
for i in xrange(j): L[j, j] = r
L[i,j] = (L[i,j] + s*x[i])/c for i in xrange(j):
x[i] = c*x[i] - s*L[i,j]; L[i, j] = (L[i, j] + s * x[i]) / c
r = np.sqrt(L[j,j]) x[i] = c * x[i] - s * L[i, j]
r = sqrt(L[j, j])


@ -8,6 +8,7 @@ from scipy.special import ndtr as std_norm_cdf
#define a standard normal pdf #define a standard normal pdf
_sqrt_2pi = np.sqrt(2*np.pi) _sqrt_2pi = np.sqrt(2*np.pi)
def std_norm_pdf(x): def std_norm_pdf(x):
x = np.clip(x,-1e150,1e150)
return np.exp(-np.square(x)/2)/_sqrt_2pi return np.exp(-np.square(x)/2)/_sqrt_2pi
def inv_std_norm_cdf(x): def inv_std_norm_cdf(x):


@ -2,6 +2,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os import os
import sys
from setuptools import setup, Extension from setuptools import setup, Extension
import numpy as np import numpy as np
@ -20,10 +21,10 @@ ext_mods = [Extension(name='GPy.kern._src.stationary_cython',
extra_compile_args=compile_flags, extra_compile_args=compile_flags,
extra_link_args = ['-lgomp']), extra_link_args = ['-lgomp']),
Extension(name='GPy.util.choleskies_cython', Extension(name='GPy.util.choleskies_cython',
sources=['GPy/util/choleskies_cython.c', 'GPy/util/cholesky_backprop.c'], sources=['GPy/util/choleskies_cython.c'],
include_dirs=[np.get_include()], include_dirs=[np.get_include()],
extra_link_args = ['-lgomp', '-lblas'], extra_link_args = ['-lgomp'],
extra_compile_args=compile_flags+['-std=c99']), extra_compile_args=compile_flags),
Extension(name='GPy.util.linalg_cython', Extension(name='GPy.util.linalg_cython',
sources=['GPy/util/linalg_cython.c'], sources=['GPy/util/linalg_cython.c'],
include_dirs=[np.get_include()], include_dirs=[np.get_include()],
@ -63,7 +64,7 @@ setup(name = 'GPy',
py_modules = ['GPy.__init__'], py_modules = ['GPy.__init__'],
test_suite = 'GPy.testing', test_suite = 'GPy.testing',
long_description=read('README.md'), long_description=read('README.md'),
install_requires=['numpy>=1.7', 'scipy>=0.12'], install_requires=['numpy>=1.7', 'scipy>=0.16'],
extras_require = {'docs':['matplotlib >=1.3','Sphinx','IPython']}, extras_require = {'docs':['matplotlib >=1.3','Sphinx','IPython']},
classifiers=['License :: OSI Approved :: BSD License', classifiers=['License :: OSI Approved :: BSD License',
'Natural Language :: English', 'Natural Language :: English',